skypilot-nightly 1.0.0.dev20250615__py3-none-any.whl → 1.0.0.dev20250617__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. sky/__init__.py +2 -4
  2. sky/backends/cloud_vm_ray_backend.py +43 -60
  3. sky/cli.py +55 -637
  4. sky/client/cli.py +55 -637
  5. sky/clouds/kubernetes.py +3 -0
  6. sky/clouds/scp.py +7 -26
  7. sky/clouds/utils/scp_utils.py +177 -124
  8. sky/dashboard/out/404.html +1 -1
  9. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-36bc0962129f72df.js +6 -0
  10. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-cf490d1fa38f3740.js +16 -0
  11. sky/dashboard/out/_next/static/{R07f8gwfXT1U0zRznq4Lg → vA3PPpkBwpRTRNBHFYAw_}/_buildManifest.js +1 -1
  12. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  13. sky/dashboard/out/clusters/[cluster].html +1 -1
  14. sky/dashboard/out/clusters.html +1 -1
  15. sky/dashboard/out/config.html +1 -1
  16. sky/dashboard/out/index.html +1 -1
  17. sky/dashboard/out/infra/[context].html +1 -1
  18. sky/dashboard/out/infra.html +1 -1
  19. sky/dashboard/out/jobs/[job].html +1 -1
  20. sky/dashboard/out/jobs.html +1 -1
  21. sky/dashboard/out/users.html +1 -1
  22. sky/dashboard/out/workspace/new.html +1 -1
  23. sky/dashboard/out/workspaces/[name].html +1 -1
  24. sky/dashboard/out/workspaces.html +1 -1
  25. sky/jobs/controller.py +98 -31
  26. sky/jobs/scheduler.py +37 -29
  27. sky/jobs/server/core.py +36 -3
  28. sky/jobs/state.py +69 -9
  29. sky/jobs/utils.py +11 -0
  30. sky/provision/__init__.py +1 -0
  31. sky/provision/scp/__init__.py +15 -0
  32. sky/provision/scp/config.py +93 -0
  33. sky/provision/scp/instance.py +528 -0
  34. sky/resources.py +164 -29
  35. sky/skylet/constants.py +39 -0
  36. sky/skylet/job_lib.py +8 -0
  37. sky/task.py +171 -21
  38. sky/templates/kubernetes-ray.yml.j2 +51 -4
  39. sky/templates/scp-ray.yml.j2 +3 -50
  40. sky/users/permission.py +19 -36
  41. sky/utils/command_runner.py +1 -1
  42. sky/utils/common_utils.py +16 -14
  43. sky/utils/context.py +1 -1
  44. sky/utils/controller_utils.py +12 -3
  45. sky/utils/dag_utils.py +17 -4
  46. sky/utils/kubernetes/deploy_remote_cluster.py +17 -8
  47. sky/utils/schemas.py +43 -5
  48. {skypilot_nightly-1.0.0.dev20250615.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/METADATA +1 -1
  49. {skypilot_nightly-1.0.0.dev20250615.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/RECORD +54 -57
  50. sky/benchmark/__init__.py +0 -0
  51. sky/benchmark/benchmark_state.py +0 -295
  52. sky/benchmark/benchmark_utils.py +0 -641
  53. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-59950b2f83b66e48.js +0 -6
  54. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-b3dbf38b51cb29be.js +0 -16
  55. sky/skylet/providers/scp/__init__.py +0 -2
  56. sky/skylet/providers/scp/config.py +0 -149
  57. sky/skylet/providers/scp/node_provider.py +0 -578
  58. /sky/dashboard/out/_next/static/{R07f8gwfXT1U0zRznq4Lg → vA3PPpkBwpRTRNBHFYAw_}/_ssgManifest.js +0 -0
  59. {skypilot_nightly-1.0.0.dev20250615.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/WHEEL +0 -0
  60. {skypilot_nightly-1.0.0.dev20250615.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/entry_points.txt +0 -0
  61. {skypilot_nightly-1.0.0.dev20250615.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/licenses/LICENSE +0 -0
  62. {skypilot_nightly-1.0.0.dev20250615.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/top_level.txt +0 -0
sky/task.py CHANGED
@@ -121,27 +121,61 @@ def _fill_in_env_vars(
     return json.loads(yaml_field_str)
 
 
-def _check_docker_login_config(task_envs: Dict[str, str]) -> bool:
-    """Checks if there is a valid docker login config in task_envs.
+def _check_docker_login_config(task_envs: Dict[str, str],
+                               task_secrets: Dict[str, str]) -> bool:
+    """Validates a valid docker login config in task_envs and task_secrets.
 
-    If any of the docker login env vars is set, all of them must be set.
+    Docker login variables must be specified together either in envs OR secrets,
+    not split across both. If any of the docker login env vars is set, all of
+    them must be set in the same location.
+
+    Args:
+        task_envs: Environment variables
+        task_secrets: Secret variables (optional, defaults to empty dict)
 
     Returns:
-        True if there is a valid docker login config in task_envs.
+        True if there is a valid docker login config.
        False otherwise.
     Raises:
-        ValueError: if any of the docker login env vars is set, but not all of
-            them are set.
+        ValueError: if docker login configuration is invalid.
     """
+    if task_secrets is None:
+        task_secrets = {}
+
     all_keys = constants.DOCKER_LOGIN_ENV_VARS
-    existing_keys = all_keys & set(task_envs.keys())
-    if not existing_keys:
+    envs_keys = all_keys & set(task_envs.keys())
+    secrets_keys = all_keys & set(task_secrets.keys())
+
+    # Check if any docker variables exist
+    if not envs_keys and not secrets_keys:
         return False
-    if len(existing_keys) != len(all_keys):
+
+    # Check if variables are split across envs and secrets
+    if envs_keys and secrets_keys:
         with ux_utils.print_exception_no_traceback():
             raise ValueError(
-                f'If any of {", ".join(all_keys)} is set, all of them must '
-                f'be set. Missing envs: {all_keys - existing_keys}')
+                'Docker login variables must be specified together either '
+                'in envs OR secrets, not split across both. '
+                f'Found in envs: {sorted(envs_keys)}, '
+                f'Found in secrets: {sorted(secrets_keys)}')
+
+    # Check if all variables are present in the chosen location
+    if envs_keys:
+        if len(envs_keys) != len(all_keys):
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError(
+                    'Docker login variables must be specified together '
+                    'in envs. '
+                    f'Missing from envs: {sorted(all_keys - envs_keys)}')
+
+    if secrets_keys:
+        if len(secrets_keys) != len(all_keys):
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError(
+                    'Docker login variables must be specified together '
+                    'in secrets. '
+                    f'Missing from secrets: {sorted(all_keys - secrets_keys)}')
+
     return True
 
 
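The rule added above is: docker login credentials may live in `envs` or in `secrets`, but the full set must be in one place. A minimal standalone sketch of that rule (the three variable names are an assumption about what `constants.DOCKER_LOGIN_ENV_VARS` contains, not taken from this diff):

```python
# Standalone sketch of the "all in one place" rule; variable names are assumed.
DOCKER_LOGIN_ENV_VARS = {
    'SKYPILOT_DOCKER_USERNAME',
    'SKYPILOT_DOCKER_PASSWORD',
    'SKYPILOT_DOCKER_SERVER',
}


def check_docker_login(envs: dict, secrets: dict) -> bool:
    envs_keys = DOCKER_LOGIN_ENV_VARS & set(envs)
    secrets_keys = DOCKER_LOGIN_ENV_VARS & set(secrets)
    if not envs_keys and not secrets_keys:
        return False  # No docker login requested.
    if envs_keys and secrets_keys:
        raise ValueError('Docker login vars split across envs and secrets.')
    present = envs_keys or secrets_keys
    if present != DOCKER_LOGIN_ENV_VARS:
        raise ValueError(f'Missing: {sorted(DOCKER_LOGIN_ENV_VARS - present)}')
    return True


# All three in secrets: valid. One in envs plus two in secrets would raise.
assert check_docker_login({}, {k: 'x' for k in DOCKER_LOGIN_ENV_VARS})
```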
@@ -149,11 +183,13 @@ def _with_docker_login_config(
     resources: Union[Set['resources_lib.Resources'],
                      List['resources_lib.Resources']],
     task_envs: Dict[str, str],
+    task_secrets: Dict[str, str],
 ) -> Union[Set['resources_lib.Resources'], List['resources_lib.Resources']]:
-    if not _check_docker_login_config(task_envs):
+    if not _check_docker_login_config(task_envs, task_secrets):
         return resources
-    docker_login_config = docker_utils.DockerLoginConfig.from_env_vars(
-        task_envs)
+    envs = task_envs.copy()
+    envs.update(task_secrets)
+    docker_login_config = docker_utils.DockerLoginConfig.from_env_vars(envs)
 
     def _add_docker_login_config(resources: 'resources_lib.Resources'):
         docker_image = resources.extract_docker_image()
@@ -181,8 +217,11 @@ def _with_docker_username_for_runpod(
     resources: Union[Set['resources_lib.Resources'],
                      List['resources_lib.Resources']],
     task_envs: Dict[str, str],
+    task_secrets: Dict[str, str],
 ) -> Union[Set['resources_lib.Resources'], List['resources_lib.Resources']]:
-    docker_username_for_runpod = task_envs.get(
+    envs = task_envs.copy()
+    envs.update(task_secrets)
+    docker_username_for_runpod = envs.get(
         constants.RUNPOD_DOCKER_USERNAME_ENV_VAR)
 
     # We should not call r.copy() if docker_username_for_runpod is None,
@@ -204,6 +243,7 @@ class Task:
         setup: Optional[str] = None,
         run: Optional[CommandOrCommandGen] = None,
         envs: Optional[Dict[str, str]] = None,
+        secrets: Optional[Dict[str, str]] = None,
         workdir: Optional[str] = None,
         num_nodes: Optional[int] = None,
         # Advanced:
@@ -254,6 +294,9 @@ class Task:
            self-contained lambda.
          envs: A dictionary of environment variables to set before running the
            setup and run commands.
+          secrets: A dictionary of secret environment variables to set before
+            running the setup and run commands. These will be redacted in logs
+            and YAML output.
          workdir: The local working directory. This directory will be synced
            to a location on the remote VM(s), and ``setup`` and ``run``
            commands will be run under that location (thus, they can rely on
@@ -275,6 +318,13 @@ class Task:
                                                 storage_lib.StoreType] = {}
         self.setup = setup
         self._envs = envs or {}
+        self._secrets = secrets or {}
+
+        # Validate Docker login configuration early if both envs and secrets
+        # contain Docker variables
+        if self._envs or self._secrets:
+            _check_docker_login_config(self._envs, self._secrets)
+
         self.workdir = workdir
         self.docker_image = (docker_image if docker_image else
                              'gpuci/miniforge-cuda:11.4-devel-ubuntu18.04')
@@ -447,6 +497,7 @@ class Task:
     def from_yaml_config(
         config: Dict[str, Any],
        env_overrides: Optional[List[Tuple[str, str]]] = None,
+        secrets_overrides: Optional[List[Tuple[str, str]]] = None,
     ) -> 'Task':
        # More robust handling for 'envs': explicitly convert keys and values to
        # str, since users may pass '123' as keys/values which will get parsed
@@ -460,6 +511,20 @@ class Task:
                 else:
                     new_envs[str(k)] = None
             config['envs'] = new_envs
+
+        # More robust handling for 'secrets': explicitly convert keys and values
+        # to str, since users may pass '123' as keys/values which will get
+        # parsed as int causing validate_schema() to fail.
+        secrets = config.get('secrets')
+        if secrets is not None and isinstance(secrets, dict):
+            new_secrets: Dict[str, Optional[str]] = {}
+            for k, v in secrets.items():
+                if v is not None:
+                    new_secrets[str(k)] = str(v)
+                else:
+                    new_secrets[str(k)] = None
+            config['secrets'] = new_secrets
+
         common_utils.validate_schema(config, schemas.get_task_schema(),
                                      'Invalid task YAML: ')
         if env_overrides is not None:
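The new `secrets` block gets the same key/value string coercion as `envs`, so YAML scalars such as `123` survive schema validation. A small illustrative sketch of that normalization, written as a hypothetical standalone helper rather than the library code:

```python
from typing import Dict, Optional


def normalize_secrets(secrets) -> Optional[Dict[str, Optional[str]]]:
    # Coerce keys and values to str so YAML-parsed ints like 123 pass schema
    # validation; None values are kept and rejected later with a clear error.
    if secrets is None or not isinstance(secrets, dict):
        return secrets
    return {
        str(k): (str(v) if v is not None else None) for k, v in secrets.items()
    }


print(normalize_secrets({123: 456, 'TOKEN': None}))
# {'123': '456', 'TOKEN': None}
```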
@@ -473,6 +538,12 @@ class Task:
             new_envs.update(env_overrides)
             config['envs'] = new_envs
 
+        if secrets_overrides is not None:
+            # Override secrets vars from CLI.
+            new_secrets = config.get('secrets', {})
+            new_secrets.update(secrets_overrides)
+            config['secrets'] = new_secrets
+
         for k, v in config.get('envs', {}).items():
             if v is None:
                 with ux_utils.print_exception_no_traceback():
@@ -482,6 +553,15 @@ class Task:
                         f'To set it to be empty, use an empty string ({k}: "" '
                         f'in task YAML or --env {k}="" in CLI).')
 
+        for k, v in config.get('secrets', {}).items():
+            if v is None:
+                with ux_utils.print_exception_no_traceback():
+                    raise ValueError(
+                        f'Secret variable {k!r} is None. Please set a '
+                        'value for it in task YAML or with --secret flag. '
+                        f'To set it to be empty, use an empty string ({k}: "" '
+                        f'in task YAML or --secret {k}="" in CLI).')
+
         # Fill in any Task.envs into file_mounts (src/dst paths, storage
         # name/source).
         if config.get('file_mounts') is not None:
@@ -505,6 +585,7 @@ class Task:
             setup=config.pop('setup', None),
             num_nodes=config.pop('num_nodes', None),
             envs=config.pop('envs', None),
+            secrets=config.pop('secrets', None),
             event_callback=config.pop('event_callback', None),
             file_mounts_mapping=config.pop('file_mounts_mapping', None),
         )
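Taken together, these hunks let `from_yaml_config` accept a `secrets` section and CLI-style overrides. A hedged usage sketch (the YAML content is invented; `secrets_overrides` is assumed to be what a `--secret KEY=VALUE` flag feeds in, mirroring `env_overrides`):

```python
import yaml
import sky

task_yaml = """
run: echo "token is $HF_TOKEN"
envs:
  MODEL: llama-3
secrets:
  HF_TOKEN: hf_placeholder
"""

config = yaml.safe_load(task_yaml)
# Overrides are applied after the YAML values, like env_overrides for envs.
task = sky.Task.from_yaml_config(config,
                                 secrets_overrides=[('HF_TOKEN', 'hf_real')])
print(task.envs)     # {'MODEL': 'llama-3'}
print(task.secrets)  # {'HF_TOKEN': 'hf_real'}
```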
@@ -687,6 +768,10 @@ class Task:
     def envs(self) -> Dict[str, str]:
         return self._envs
 
+    @property
+    def secrets(self) -> Dict[str, str]:
+        return self._secrets
+
     def update_envs(
             self, envs: Union[None, List[Tuple[str, str]],
                               Dict[str, str]]) -> 'Task':
@@ -727,17 +812,70 @@ class Task:
         # If the update_envs() is called after set_resources(), we need to
         # manually update docker login config in task resources, in case the
         # docker login envs are newly added.
-        if _check_docker_login_config(self._envs):
+        if _check_docker_login_config(self._envs, self._secrets):
             self.resources = _with_docker_login_config(self.resources,
-                                                        self._envs)
+                                                        self._envs,
+                                                        self._secrets)
         self.resources = _with_docker_username_for_runpod(
-            self.resources, self._envs)
+            self.resources, self._envs, self._secrets)
+        return self
+
+    def update_secrets(
+            self, secrets: Union[None, List[Tuple[str, str]],
+                                 Dict[str, str]]) -> 'Task':
+        """Updates secret env vars for use inside the setup/run commands.
+
+        Args:
+          secrets: (optional) either a list of ``(secret_name, value)`` or a
+            dict ``{secret_name: value}``.
+
+        Returns:
+          self: The current task, with secrets updated.
+
+        Raises:
+            ValueError: if various invalid inputs errors are detected.
+        """
+        if secrets is None:
+            secrets = {}
+        if isinstance(secrets, (list, tuple)):
+            keys = set(secret[0] for secret in secrets)
+            if len(keys) != len(secrets):
+                with ux_utils.print_exception_no_traceback():
+                    raise ValueError('Duplicate secret keys provided.')
+            secrets = dict(secrets)
+        if isinstance(secrets, dict):
+            for key in secrets:
+                if not isinstance(key, str):
+                    with ux_utils.print_exception_no_traceback():
+                        raise ValueError('Secret keys must be strings.')
+                if not common_utils.is_valid_env_var(key):
+                    with ux_utils.print_exception_no_traceback():
+                        raise ValueError(f'Invalid secret key: {key}')
+        else:
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError(
+                    'secrets must be List[Tuple[str, str]] or Dict[str, str]: '
+                    f'{secrets}')
+        self._secrets.update(secrets)
+        # Validate Docker login configuration if needed
+        if _check_docker_login_config(self._envs, self._secrets):
+            self.resources = _with_docker_login_config(self.resources,
+                                                        self._envs,
+                                                        self._secrets)
+        self.resources = _with_docker_username_for_runpod(
+            self.resources, self._envs, self._secrets)
         return self
 
     @property
     def use_spot(self) -> bool:
         return any(r.use_spot for r in self.resources)
 
+    @property
+    def envs_and_secrets(self) -> Dict[str, str]:
+        envs = self.envs.copy()
+        envs.update(self.secrets)
+        return envs
+
     def set_inputs(self, inputs: str,
                    estimated_size_gigabytes: float) -> 'Task':
         # E.g., 's3://bucket', 'gs://bucket', or None.
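With the new property and `update_secrets`, secrets can be managed programmatically much like envs. A hedged example of the intended flow (argument values are placeholders):

```python
import sky

task = sky.Task(run='python train.py',
                envs={'MODEL': 'llama-3'},
                secrets={'WANDB_API_KEY': 'placeholder'})

# update_secrets() mirrors update_envs(): it accepts a dict or a list of
# (name, value) pairs and re-checks the docker login config afterwards.
task.update_secrets([('HF_TOKEN', 'hf_placeholder')])

# envs_and_secrets merges both views for code that needs the full environment.
print(sorted(task.envs_and_secrets))
# ['HF_TOKEN', 'MODEL', 'WANDB_API_KEY']
```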
@@ -796,10 +934,11 @@ class Task:
         if isinstance(resources, sky.Resources):
             resources = {resources}
         # TODO(woosuk): Check if the resources are None.
-        self.resources = _with_docker_login_config(resources, self.envs)
+        self.resources = _with_docker_login_config(resources, self.envs,
+                                                   self.secrets)
         # Only have effect on RunPod.
         self.resources = _with_docker_username_for_runpod(
-            self.resources, self.envs)
+            self.resources, self.envs, self.secrets)
 
         # Evaluate if the task requires FUSE and set the requires_fuse flag
         for _, storage_obj in self.storage_mounts.items():
@@ -1266,7 +1405,7 @@ class Task:
                 d[k] = v
             return d
 
-    def to_yaml_config(self) -> Dict[str, Any]:
+    def to_yaml_config(self, redact_secrets: bool = True) -> Dict[str, Any]:
         """Returns a yaml-style dict representation of the task.
 
         INTERNAL: this method is internal-facing.
@@ -1314,8 +1453,19 @@ class Task:
         add_if_not_none('workdir', self.workdir)
         add_if_not_none('event_callback', self.event_callback)
         add_if_not_none('run', self.run)
+
+        # Add envs without redaction
         add_if_not_none('envs', self.envs, no_empty=True)
 
+        # Add secrets with redaction if requested
+        secrets = self.secrets
+        if secrets and redact_secrets:
+            secrets = {
+                k: '<redacted>' if isinstance(v, str) else v
+                for k, v in secrets.items()
+            }
+        add_if_not_none('secrets', secrets, no_empty=True)
+
         add_if_not_none('file_mounts', {})
 
         if self.file_mounts is not None:
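A short sketch of the redaction behavior added to `to_yaml_config` (values are placeholders; the default keeps secret values out of logs and dumped YAML):

```python
import sky

task = sky.Task(run='echo hi', secrets={'HF_TOKEN': 'hf_placeholder'})

# Default: secret values are masked in the YAML-style dict.
print(task.to_yaml_config()['secrets'])
# {'HF_TOKEN': '<redacted>'}

# Opt out only where the real values are actually needed, e.g. when the config
# is handed to the backend rather than logged or displayed.
print(task.to_yaml_config(redact_secrets=False)['secrets'])
# {'HF_TOKEN': 'hf_placeholder'}
```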
sky/templates/kubernetes-ray.yml.j2 CHANGED
@@ -632,19 +632,66 @@ available_node_types:
           {% if high_availability %}
           mkdir -p {{k8s_high_availability_deployment_run_script_dir}}
           if [ -f {{k8s_high_availability_deployment_volume_mount_path}}/k8s_container_ready ]; then
+            SKYPILOT_HA_RECOVERY_LOG="/tmp/ha_recovery.log"
+            echo "Starting HA recovery at $(date)" >> $SKYPILOT_HA_RECOVERY_LOG
+            start_time=$SECONDS
+            retry_count=0
+
+            # Wait for Ray to be ready, as the following commands is depending on Ray.
+            GET_RAY_STATUS_CMD=$({{sky_python_cmd}} -c 'from sky.provision import instance_setup; print(instance_setup.RAY_STATUS_WITH_SKY_RAY_PORT_COMMAND)')
+            while true; do
+              retry_count=$((retry_count + 1))
+              current_duration=$(( SECONDS - start_time ))
+              echo "Attempt $retry_count to get Ray status after $current_duration seconds..." >> $SKYPILOT_HA_RECOVERY_LOG
+
+              bash --login -c "$GET_RAY_STATUS_CMD"
+              if [ $? -eq 0 ]; then
+                wait_duration=$(( SECONDS - start_time ))
+                echo "Ray ready after waiting $wait_duration seconds (took $retry_count attempts)" >> $SKYPILOT_HA_RECOVERY_LOG
+                break
+              fi
+              echo "Waiting for Ray to be ready..." >> $SKYPILOT_HA_RECOVERY_LOG
+              sleep 2
+            done
+
             # ! Keep this aligned with `CloudVmRayBackend._setup()`
-            # Suppose all `task.setup` are the same for skyserve controller task.
+            # Suppose all `task.setup` are the same for sky serve / managed jobs controller task.
             # So be careful for compatibility issue once you change it.
             chmod +x {{k8s_high_availability_deployment_setup_script_path}}
             /bin/bash --login -c "true && export OMP_NUM_THREADS=1 PYTHONWARNINGS='ignore' && {{k8s_high_availability_deployment_setup_script_path}} > /tmp/controller_recovery_setup_commands.log 2>&1"
-            echo "=== Controller setup commands completed for recovery ==="
-
+            echo "=== Controller setup commands completed for recovery at $(date) ===" >> $SKYPILOT_HA_RECOVERY_LOG
+
+            touch {{k8s_high_availability_restarting_signal_file}}
+            # Get all in-progress jobs from managed jobs controller. We skip any jobs that are already done.
+            # Also, skip the jobs that are waiting to be scheduled as those does not have a controller process running.
+            # For SkyServe, this will be None and every service will be recovered. This is because SkyServe
+            # will delete the service from the database after it is terminated so everything in the database is running.
+            ALL_IN_PROGRESS_JOBS=$({{sky_python_cmd}} -c "from sky.jobs import state; jobs = state.get_managed_jobs(); print(' '.join({str(job['job_id']) for job in jobs if job['schedule_state'] not in [state.ManagedJobScheduleState.DONE, state.ManagedJobScheduleState.WAITING]}) if jobs else None)")
+            if [ "$ALL_IN_PROGRESS_JOBS" != "None" ]; then
+              read -ra ALL_IN_PROGRESS_JOBS_SEQ <<< "$ALL_IN_PROGRESS_JOBS"
+            fi
             for file in {{k8s_high_availability_deployment_run_script_dir}}/*; do
+              # This is the cluster job id on managed jobs controller, but it is guaranteed to be the same as the managed job id,
+              # so we directly use it here. See `CloudVmRayBackend._exec_code_on_head::_dump_code_to_file` for more details.
+              JOB_ID=$(basename $file | sed 's/sky_job_//')
+              # If the list of in-progress jobs is not None (meaning this is a managed job HA controller) and job is not in-progress, skip.
+              if [ "$ALL_IN_PROGRESS_JOBS" != "None" ]; then
+                if [[ ! " ${ALL_IN_PROGRESS_JOBS_SEQ[@]} " =~ " ${JOB_ID} " ]]; then
+                  continue
+                fi
+              fi
               # ! Keep this aligned with `CloudVmRayBackend._execute()`
               chmod +x $file
+              # TODO(tian): This logic may run a lot of things if the jobs controller previously had many jobs.
+              # We should do more tests and make sure it will scale well.
               /bin/bash --login -c "true && export OMP_NUM_THREADS=1 PYTHONWARNINGS='ignore' && $file > /tmp/task_run_$(basename $file).log 2>&1"
-              echo "=== Controller task run for service (file: $file) completed for recovery ==="
+              echo "=== Controller task run for service / job (file: $file) completed for recovery at $(date) ===" >> $SKYPILOT_HA_RECOVERY_LOG
             done
+            rm {{k8s_high_availability_restarting_signal_file}}
+
+            duration=$(( SECONDS - start_time ))
+            echo "HA recovery completed at $(date)" >> $SKYPILOT_HA_RECOVERY_LOG
+            echo "Total recovery time: $duration seconds" >> $SKYPILOT_HA_RECOVERY_LOG
           fi
 
           touch {{k8s_high_availability_deployment_volume_mount_path}}/k8s_container_ready
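For readability, here is the `ALL_IN_PROGRESS_JOBS` one-liner embedded in the template above, expanded into plain Python (same calls as in the template; it is meant to run on the jobs controller, where the managed-jobs database exists):

```python
# Prints a space-separated list of managed job ids that still need a
# controller process, or `None` on a SkyServe controller.
from sky.jobs import state

jobs = state.get_managed_jobs()
if jobs:
    in_progress = {
        str(job['job_id'])
        for job in jobs
        if job['schedule_state'] not in (state.ManagedJobScheduleState.DONE,
                                         state.ManagedJobScheduleState.WAITING)
    }
    print(' '.join(in_progress))
else:
    print(None)
```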
sky/templates/scp-ray.yml.j2 CHANGED
@@ -7,7 +7,7 @@ idle_timeout_minutes: 60
 
 provider:
   type: external
-  module: sky.skylet.providers.scp.SCPNodeProvider
+  module: sky.provision.scp
   region: {{region}}
   cache_stopped_nodes: True
 
@@ -24,19 +24,6 @@ available_node_types:
       InstanceType: {{instance_type}}
      imageId: {{image_id}}
      diskSize: {{disk_size}}
-  {% if num_nodes > 1 %}
-  ray_worker_default:
-    min_workers: {{num_nodes - 1}}
-    max_workers: {{num_nodes - 1}}
-    resources: {}
-    node_config:
-      AuthorizedKey: |
-        skypilot:ssh_public_key_content
-      InstanceType: {{instance_type}}
-      imageId: {{image_id}}
-      diskSize: {{disk_size}}
-
-  {%- endif %}
 
 head_node_type: ray_head_default
 
@@ -50,10 +37,6 @@ file_mounts: {
 {%- endfor %}
 }
 
-rsync_exclude: []
-
-initialization_commands: []
-
 # List of shell commands to run to set up nodes.
 # NOTE: these are very performance-sensitive. Each new item opens/closes an SSH
 # connection, which is expensive. Try your best to co-locate commands into fewer
@@ -77,36 +60,6 @@ setup_commands:
   sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
   mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
   [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf'); # This is needed for `-o allow_other` option for `goofys`;
-  {{ ssh_max_sessions_config }}
-
-# Command to start ray on the head node. You don't need to change this.
-# NOTE: these are very performance-sensitive. Each new item opens/closes an SSH
-# connection, which is expensive. Try your best to co-locate commands into fewer
-# items! The same comment applies for worker_start_ray_commands.
-#
-# Increment the following for catching performance bugs easier:
-# current num items (num SSH connections): 1
-head_start_ray_commands:
-  # NOTE: --disable-usage-stats in `ray start` saves 10 seconds of idle wait.
-  # Line "which prlimit ..": increase the limit of the number of open files for the raylet process, as the `ulimit` may not take effect at this point, because it requires
-  # all the sessions to be reloaded. This is a workaround.
-  - {{ sky_activate_python_env }}; {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
-    which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
-    {{dump_port_command}}; {{ray_head_wait_initialized_command}}
-
-{%- if num_nodes > 1 %}
-worker_start_ray_commands:
-  - {{ sky_activate_python_env }}; {{ sky_ray_cmd }} stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 {{ sky_ray_cmd }} start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
-    which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
-{%- else %}
-worker_start_ray_commands: []
-{%- endif %}
-
-head_node: {}
-worker_nodes: {}
 
-# These fields are required for external cloud providers.
-head_setup_commands: []
-worker_setup_commands: []
-cluster_synced_files: []
-file_mounts_sync_continuously: False
+# Command to start ray clusters are now placed in `sky.provision.instance_setup`.
+# We do not need to list it here anymore.
sky/users/permission.py CHANGED
@@ -30,36 +30,28 @@ class PermissionService:
     """Permission service for SkyPilot API Server."""
 
     def __init__(self):
-        self.enforcer = None
-        self.init_lock = threading.Lock()
-
-    def _lazy_initialize(self):
-        if self.enforcer is not None:
-            return
-        with self.init_lock:
-            if self.enforcer is not None:
-                return
-            global _enforcer_instance
-            if _enforcer_instance is None:
-                # For different threads, we share the same enforcer instance.
-                with _lock:
-                    if _enforcer_instance is None:
-                        _enforcer_instance = self
-                        engine = global_user_state.initialize_and_get_db()
-                        adapter = sqlalchemy_adapter.Adapter(engine)
-                        model_path = os.path.join(os.path.dirname(__file__),
-                                                  'model.conf')
-                        enforcer = casbin.Enforcer(model_path, adapter)
-                        self.enforcer = enforcer
-                    else:
-                        self.enforcer = _enforcer_instance.enforcer
-            else:
-                self.enforcer = _enforcer_instance.enforcer
-            with _policy_lock():
-                self._maybe_initialize_policies()
+        global _enforcer_instance
+        if _enforcer_instance is None:
+            # For different threads, we share the same enforcer instance.
+            with _lock:
+                if _enforcer_instance is None:
+                    _enforcer_instance = self
+                    engine = global_user_state.initialize_and_get_db()
+                    adapter = sqlalchemy_adapter.Adapter(engine)
+                    model_path = os.path.join(os.path.dirname(__file__),
+                                              'model.conf')
+                    enforcer = casbin.Enforcer(model_path, adapter)
+                    self.enforcer = enforcer
+                else:
+                    self.enforcer = _enforcer_instance.enforcer
+        else:
+            self.enforcer = _enforcer_instance.enforcer
+        with _policy_lock():
+            self._maybe_initialize_policies()
 
     def _maybe_initialize_policies(self) -> None:
         """Initialize policies if they don't already exist."""
+        # TODO(zhwu): we should avoid running this on client side.
         logger.debug(f'Initializing policies in process: {os.getpid()}')
         self._load_policy_no_lock()
 
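The refactor above drops the per-call `_lazy_initialize()` in favor of eager initialization in `__init__`, with a module-level lock so all threads share one casbin enforcer. A generic sketch of that double-checked singleton pattern (illustrative only, not the SkyPilot code):

```python
import threading

_instance = None
_lock = threading.Lock()


class Service:

    def __init__(self):
        global _instance
        if _instance is None:
            with _lock:  # Double-checked locking around the expensive setup.
                if _instance is None:
                    _instance = self
                    self.engine = object()  # Expensive resource, built once.
                else:
                    self.engine = _instance.engine
        else:
            self.engine = _instance.engine


# Every later construction reuses the first instance's engine.
assert Service().engine is Service().engine
```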
@@ -138,7 +130,6 @@ class PermissionService:
 
     def add_user_if_not_exists(self, user_id: str) -> None:
         """Add user role relationship."""
-        self._lazy_initialize()
         with _policy_lock():
             self._add_user_if_not_exists_no_lock(user_id)
 
@@ -158,7 +149,6 @@ class PermissionService:
 
     def update_role(self, user_id: str, new_role: str) -> None:
         """Update user role relationship."""
-        self._lazy_initialize()
         with _policy_lock():
             # Get current roles
             self._load_policy_no_lock()
@@ -191,7 +181,6 @@ class PermissionService:
         Returns:
             A list of role names that the user has.
         """
-        self._lazy_initialize()
         self._load_policy_no_lock()
         return self.enforcer.get_roles_for_user(user_id)
 
@@ -204,7 +193,6 @@ class PermissionService:
         # it is a hot path in every request. It is ok to have a stale policy,
         # as long as it is eventually consistent.
         # self._load_policy_no_lock()
-        self._lazy_initialize()
         return self.enforcer.enforce(user_id, path, method)
 
     def _load_policy_no_lock(self):
@@ -213,7 +201,6 @@ class PermissionService:
 
     def load_policy(self):
         """Load policy from storage with lock."""
-        self._lazy_initialize()
         with _policy_lock():
             self._load_policy_no_lock()
 
@@ -229,7 +216,6 @@ class PermissionService:
         For public workspaces, the permission is granted via a wildcard policy
         ('*').
         """
-        self._lazy_initialize()
         if os.getenv(constants.ENV_VAR_IS_SKYPILOT_SERVER) is None:
             # When it is not on API server, we allow all users to access all
             # workspaces, as the workspace check has been done on API server.
@@ -257,7 +243,6 @@ class PermissionService:
         For public workspaces, this should be ['*'].
         For private workspaces, this should be specific user IDs.
         """
-        self._lazy_initialize()
         with _policy_lock():
             for user in users:
                 logger.debug(f'Adding workspace policy: user={user}, '
@@ -275,7 +260,6 @@ class PermissionService:
         For public workspaces, this should be ['*'].
         For private workspaces, this should be specific user IDs.
         """
-        self._lazy_initialize()
         with _policy_lock():
             self._load_policy_no_lock()
             # Remove all existing policies for this workspace
@@ -289,7 +273,6 @@ class PermissionService:
 
     def remove_workspace_policy(self, workspace_name: str) -> None:
         """Remove workspace policy."""
-        self._lazy_initialize()
         with _policy_lock():
             self.enforcer.remove_filtered_policy(1, workspace_name)
             self.enforcer.save_policy()
sky/utils/command_runner.py CHANGED
@@ -561,7 +561,7 @@ class SSHCommandRunner(CommandRunner):
         if self.ssh_control_name is not None:
             control_path = _ssh_control_path(self.ssh_control_name)
             if control_path is not None:
-                # Suppress the `Exit request sent.` output for this comamnd
+                # Suppress the `Exit request sent.` output for this command
                 # which would interrupt the CLI spinner.
                 cmd = (f'ssh -O exit -S {control_path}/%C '
                        f'{self.ssh_user}@{self.ip} > /dev/null 2>&1')