skypilot-nightly 1.0.0.dev20241109__py3-none-any.whl → 1.0.0.dev20241111__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. sky/__init__.py +2 -2
  2. sky/backends/cloud_vm_ray_backend.py +0 -19
  3. sky/clouds/oci.py +11 -21
  4. sky/clouds/service_catalog/oci_catalog.py +1 -1
  5. sky/clouds/utils/oci_utils.py +16 -2
  6. sky/dag.py +19 -15
  7. sky/provision/__init__.py +1 -0
  8. sky/provision/docker_utils.py +1 -1
  9. sky/provision/kubernetes/instance.py +104 -102
  10. sky/provision/oci/__init__.py +15 -0
  11. sky/provision/oci/config.py +51 -0
  12. sky/provision/oci/instance.py +430 -0
  13. sky/{skylet/providers/oci/query_helper.py → provision/oci/query_utils.py} +148 -59
  14. sky/serve/__init__.py +2 -0
  15. sky/serve/load_balancer.py +34 -8
  16. sky/serve/load_balancing_policies.py +23 -1
  17. sky/serve/service.py +4 -1
  18. sky/serve/service_spec.py +19 -0
  19. sky/setup_files/MANIFEST.in +0 -1
  20. sky/skylet/job_lib.py +29 -17
  21. sky/templates/kubernetes-ray.yml.j2 +21 -1
  22. sky/templates/oci-ray.yml.j2 +3 -53
  23. sky/utils/schemas.py +8 -0
  24. {skypilot_nightly-1.0.0.dev20241109.dist-info → skypilot_nightly-1.0.0.dev20241111.dist-info}/METADATA +1 -1
  25. {skypilot_nightly-1.0.0.dev20241109.dist-info → skypilot_nightly-1.0.0.dev20241111.dist-info}/RECORD +29 -29
  26. sky/skylet/providers/oci/__init__.py +0 -2
  27. sky/skylet/providers/oci/node_provider.py +0 -488
  28. sky/skylet/providers/oci/utils.py +0 -21
  29. {skypilot_nightly-1.0.0.dev20241109.dist-info → skypilot_nightly-1.0.0.dev20241111.dist-info}/LICENSE +0 -0
  30. {skypilot_nightly-1.0.0.dev20241109.dist-info → skypilot_nightly-1.0.0.dev20241111.dist-info}/WHEEL +0 -0
  31. {skypilot_nightly-1.0.0.dev20241109.dist-info → skypilot_nightly-1.0.0.dev20241111.dist-info}/entry_points.txt +0 -0
  32. {skypilot_nightly-1.0.0.dev20241109.dist-info → skypilot_nightly-1.0.0.dev20241111.dist-info}/top_level.txt +0 -0
sky/__init__.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
5
5
  import urllib.request
6
6
 
7
7
  # Replaced with the current commit when building the wheels.
8
- _SKYPILOT_COMMIT_SHA = '42c79e1d0a5e018e275705ada53957573f9a0181'
8
+ _SKYPILOT_COMMIT_SHA = '91323d86baaeb1341c6953e15bbf19f2896b67ad'
9
9
 
10
10
 
11
11
  def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
35
35
 
36
36
 
37
37
  __commit__ = _get_git_commit()
38
- __version__ = '1.0.0.dev20241109'
38
+ __version__ = '1.0.0.dev20241111'
39
39
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))
40
40
 
41
41
 
@@ -3979,25 +3979,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3979
3979
  stdout = ''
3980
3980
  stderr = str(e)
3981
3981
 
3982
- # Apr, 2023 by Hysun(hysun.he@oracle.com): Added support for OCI
3983
- # May, 2023 by Hysun: Allow terminate INIT cluster which may have
3984
- # some instances provisioning in background but not completed.
3985
- elif (isinstance(cloud, clouds.OCI) and terminate and
3986
- prev_cluster_status in (status_lib.ClusterStatus.STOPPED,
3987
- status_lib.ClusterStatus.INIT)):
3988
- region = config['provider']['region']
3989
-
3990
- # pylint: disable=import-outside-toplevel
3991
- from ray.autoscaler.tags import TAG_RAY_CLUSTER_NAME
3992
-
3993
- from sky.skylet.providers.oci.query_helper import oci_query_helper
3994
-
3995
- # 0: All terminated successfully, failed count otherwise
3996
- returncode = oci_query_helper.terminate_instances_by_tags(
3997
- {TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud}, region)
3998
-
3999
- # To avoid undefined local variables error.
4000
- stdout = stderr = ''
4001
3982
  else:
4002
3983
  config['provider']['cache_stopped_nodes'] = not terminate
4003
3984
  with tempfile.NamedTemporaryFile('w',
sky/clouds/oci.py CHANGED
@@ -31,6 +31,7 @@ from sky import status_lib
31
31
  from sky.adaptors import oci as oci_adaptor
32
32
  from sky.clouds import service_catalog
33
33
  from sky.clouds.utils import oci_utils
34
+ from sky.provision.oci.query_utils import query_helper
34
35
  from sky.utils import common_utils
35
36
  from sky.utils import resources_utils
36
37
  from sky.utils import ux_utils
@@ -60,6 +61,9 @@ class OCI(clouds.Cloud):
60
61
  {resources_utils.DiskTier.ULTRA})
61
62
  _BEST_DISK_TIER = resources_utils.DiskTier.HIGH
62
63
 
64
+ PROVISIONER_VERSION = clouds.ProvisionerVersion.SKYPILOT
65
+ STATUS_VERSION = clouds.StatusVersion.SKYPILOT
66
+
63
67
  @classmethod
64
68
  def _unsupported_features_for_resources(
65
69
  cls, resources: 'resources_lib.Resources'
@@ -433,7 +437,7 @@ class OCI(clouds.Cloud):
433
437
  return True, None
434
438
  except (oci_adaptor.oci.exceptions.ConfigFileNotFound,
435
439
  oci_adaptor.oci.exceptions.InvalidConfig,
436
- oci_adaptor.service_exception()) as e:
440
+ oci_adaptor.oci.exceptions.ServiceError) as e:
437
441
  return False, (
438
442
  f'OCI credential is not correctly set. '
439
443
  f'Check the credential file at {conf_file}\n'
@@ -597,25 +601,11 @@ class OCI(clouds.Cloud):
597
601
  region: Optional[str], zone: Optional[str],
598
602
  **kwargs) -> List[status_lib.ClusterStatus]:
599
603
  del zone, kwargs # Unused.
600
- # Check the lifecycleState definition from the page
601
- # https://docs.oracle.com/en-us/iaas/api/#/en/iaas/latest/Instance/
602
- status_map = {
603
- 'PROVISIONING': status_lib.ClusterStatus.INIT,
604
- 'STARTING': status_lib.ClusterStatus.INIT,
605
- 'RUNNING': status_lib.ClusterStatus.UP,
606
- 'STOPPING': status_lib.ClusterStatus.STOPPED,
607
- 'STOPPED': status_lib.ClusterStatus.STOPPED,
608
- 'TERMINATED': None,
609
- 'TERMINATING': None,
610
- }
611
-
612
- # pylint: disable=import-outside-toplevel
613
- from sky.skylet.providers.oci.query_helper import oci_query_helper
614
604
 
615
605
  status_list = []
616
606
  try:
617
- vms = oci_query_helper.query_instances_by_tags(
618
- tag_filters=tag_filters, region=region)
607
+ vms = query_helper.query_instances_by_tags(tag_filters=tag_filters,
608
+ region=region)
619
609
  except Exception as e: # pylint: disable=broad-except
620
610
  with ux_utils.print_exception_no_traceback():
621
611
  raise exceptions.ClusterStatusFetchingError(
@@ -625,9 +615,9 @@ class OCI(clouds.Cloud):
625
615
 
626
616
  for node in vms:
627
617
  vm_status = node.lifecycle_state
628
- if vm_status in status_map:
629
- sky_status = status_map[vm_status]
630
- if sky_status is not None:
631
- status_list.append(sky_status)
618
+ sky_status = oci_utils.oci_config.STATE_MAPPING_OCI_TO_SKY.get(
619
+ vm_status, None)
620
+ if sky_status is not None:
621
+ status_list.append(sky_status)
632
622
 
633
623
  return status_list
@@ -66,7 +66,7 @@ def _get_df() -> 'pd.DataFrame':
66
66
  logger.debug(f'It is OK goes here when testing: {str(e)}')
67
67
  subscribed_regions = []
68
68
 
69
- except oci_adaptor.service_exception() as e:
69
+ except oci_adaptor.oci.exceptions.ServiceError as e:
70
70
  # Should never expect going here. However, we still catch
71
71
  # it so that if any OCI call failed, the program can still
72
72
  # proceed with try-and-error way.
@@ -5,13 +5,14 @@ History:
5
5
  - Hysun He (hysun.he@oracle.com) @ Oct, 2024: Add default image OS
6
6
  configuration.
7
7
  """
8
- import logging
9
8
  import os
10
9
 
10
+ from sky import sky_logging
11
11
  from sky import skypilot_config
12
+ from sky import status_lib
12
13
  from sky.utils import resources_utils
13
14
 
14
- logger = logging.getLogger(__name__)
15
+ logger = sky_logging.init_logger(__name__)
15
16
 
16
17
 
17
18
  class OCIConfig:
@@ -77,6 +78,19 @@ class OCIConfig:
77
78
  resources_utils.DiskTier.HIGH: DISK_TIER_HIGH,
78
79
  }
79
80
 
81
+ # Oracle instance's lifecycle state to sky state mapping.
82
+ # For Oracle VM instance's lifecycle state, please refer to the link:
83
+ # https://docs.oracle.com/en-us/iaas/api/#/en/iaas/latest/Instance/
84
+ STATE_MAPPING_OCI_TO_SKY = {
85
+ 'PROVISIONING': status_lib.ClusterStatus.INIT,
86
+ 'STARTING': status_lib.ClusterStatus.INIT,
87
+ 'RUNNING': status_lib.ClusterStatus.UP,
88
+ 'STOPPING': status_lib.ClusterStatus.STOPPED,
89
+ 'STOPPED': status_lib.ClusterStatus.STOPPED,
90
+ 'TERMINATED': None,
91
+ 'TERMINATING': None,
92
+ }
93
+
80
94
  @classmethod
81
95
  def get_compartment(cls, region):
82
96
  # Allow task(cluster)-specific compartment/VCN parameters.
sky/dag.py CHANGED
@@ -56,21 +56,25 @@ class Dag:
56
56
  return self.graph
57
57
 
58
58
  def is_chain(self) -> bool:
59
- # NOTE: this method assumes that the graph has no cycle.
60
- is_chain = True
61
- visited_zero_out_degree = False
62
- for node in self.graph.nodes:
63
- out_degree = self.graph.out_degree(node)
64
- if out_degree > 1:
65
- is_chain = False
66
- break
67
- elif out_degree == 0:
68
- if visited_zero_out_degree:
69
- is_chain = False
70
- break
71
- else:
72
- visited_zero_out_degree = True
73
- return is_chain
59
+ """Check if the DAG is a linear chain of tasks."""
60
+
61
+ nodes = list(self.graph.nodes)
62
+
63
+ if len(nodes) == 0:
64
+ return True
65
+
66
+ in_degrees = [self.graph.in_degree(node) for node in nodes]
67
+ out_degrees = [self.graph.out_degree(node) for node in nodes]
68
+
69
+ # Check out-degrees: all <= 1 and exactly one node has out_degree == 0
70
+ out_degree_condition = (all(degree <= 1 for degree in out_degrees) and
71
+ sum(degree == 0 for degree in out_degrees) == 1)
72
+
73
+ # Check in-degrees: all <= 1 and exactly one node has in_degree == 0
74
+ in_degree_condition = (all(degree <= 1 for degree in in_degrees) and
75
+ sum(degree == 0 for degree in in_degrees) == 1)
76
+
77
+ return out_degree_condition and in_degree_condition
74
78
 
75
79
 
76
80
  class _DagContext(threading.local):
sky/provision/__init__.py CHANGED
@@ -20,6 +20,7 @@ from sky.provision import fluidstack
20
20
  from sky.provision import gcp
21
21
  from sky.provision import kubernetes
22
22
  from sky.provision import lambda_cloud
23
+ from sky.provision import oci
23
24
  from sky.provision import runpod
24
25
  from sky.provision import vsphere
25
26
  from sky.utils import command_runner
@@ -20,7 +20,7 @@ SETUP_ENV_VARS_CMD = (
20
20
  '{ if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; } && '
21
21
  'printenv | while IFS=\'=\' read -r key value; do echo "export $key=\\\"$value\\\""; done > ' # pylint: disable=line-too-long
22
22
  '~/container_env_var.sh && '
23
- '$(prefix_cmd) mv ~/container_env_var.sh /etc/profile.d/container_env_var.sh'
23
+ '$(prefix_cmd) mv ~/container_env_var.sh /etc/profile.d/container_env_var.sh;'
24
24
  )
25
25
 
26
26
  # Docker daemon may not be ready when the machine is firstly started. The error
@@ -333,52 +333,37 @@ def _run_function_with_retries(func: Callable,
333
333
  raise
334
334
 
335
335
 
336
- def _set_env_vars_in_pods(namespace: str, context: Optional[str],
337
- new_pods: List):
338
- """Setting environment variables in pods.
339
-
340
- Once all containers are ready, we can exec into them and set env vars.
341
- Kubernetes automatically populates containers with critical
342
- environment variables, such as those for discovering services running
343
- in the cluster and CUDA/nvidia environment variables. We need to
344
- make sure these env vars are available in every task and ssh session.
345
- This is needed for GPU support and service discovery.
346
- See https://github.com/skypilot-org/skypilot/issues/2287 for
347
- more details.
348
-
349
- To do so, we capture env vars from the pod's runtime and write them to
350
- /etc/profile.d/, making them available for all users in future
351
- shell sessions.
352
- """
353
- set_k8s_env_var_cmd = docker_utils.SETUP_ENV_VARS_CMD
336
+ def pre_init(namespace: str, context: Optional[str], new_nodes: List) -> None:
337
+ """Pre-initialization step for SkyPilot pods.
354
338
 
355
- def _set_env_vars_thread(new_pod):
356
- pod_name = new_pod.metadata.name
357
- logger.info(f'{"-"*20}Start: Set up env vars in pod {pod_name!r} '
358
- f'{"-"*20}')
359
- runner = command_runner.KubernetesCommandRunner(
360
- ((namespace, context), pod_name))
339
+ This step is run in the pod right after it is created and before the
340
+ SkyPilot runtime is setup.
361
341
 
362
- def _run_env_vars_cmd():
363
- rc, stdout, _ = runner.run(set_k8s_env_var_cmd,
364
- require_outputs=True,
365
- stream_logs=False)
366
- _raise_command_running_error('set env vars', set_k8s_env_var_cmd,
367
- pod_name, rc, stdout)
342
+ This step includes three key steps:
368
343
 
369
- _run_function_with_retries(_run_env_vars_cmd,
370
- f'set env vars in pod {pod_name}')
371
- logger.info(f'{"-"*20}End: Set up env vars in pod {pod_name!r} '
372
- f'{"-"*20}')
344
+ 1. Privilege check: Checks if the default user has sufficient privilege
345
+ to set up the kubernetes instance pod.
346
+ 2. SSH setup: Sets up SSH for the pod instance.
347
+ 3. Environment variable setup to populate k8s env vars in the pod.
373
348
 
374
- subprocess_utils.run_in_parallel(_set_env_vars_thread, new_pods,
375
- NUM_THREADS)
349
+ Make sure commands used in these methods are generic and work
350
+ on most base images. E.g., do not use Python, since that may not
351
+ be installed by default.
376
352
 
353
+ If you run any apt commands, be sure to check if the lock is available.
354
+ It is possible the `apt update` run in the pod container args may still
355
+ be running.
356
+
357
+ Args:
358
+ namespace (str): Kubernetes namespace.
359
+ context (Optional[str]): Kubernetes context.
360
+ new_nodes (List): List of new pod instances.
361
+
362
+ Raises:
363
+ config_lib.KubernetesError: If user privileges are insufficient or
364
+ setup fails.
365
+ """
377
366
 
378
- def _check_user_privilege(namespace: str, context: Optional[str],
379
- new_nodes: List) -> None:
380
- # Checks if the default user has sufficient privilege to set up
381
- # the kubernetes instance pod.
382
367
  check_k8s_user_sudo_cmd = (
383
368
  'if [ $(id -u) -eq 0 ]; then'
384
369
  # If user is root, create an alias for sudo used in skypilot setup
@@ -386,56 +371,67 @@ def _check_user_privilege(namespace: str, context: Optional[str],
386
371
  'else '
387
372
  ' if command -v sudo >/dev/null 2>&1; then '
388
373
  ' timeout 2 sudo -l >/dev/null 2>&1 && echo succeed || '
389
- f' ( echo {exceptions.INSUFFICIENT_PRIVILEGES_CODE!r}; ); '
374
+ f' ( echo {exceptions.INSUFFICIENT_PRIVILEGES_CODE!r}; '
375
+ f' exit {exceptions.INSUFFICIENT_PRIVILEGES_CODE}; ); '
390
376
  ' else '
391
- f' ( echo {exceptions.INSUFFICIENT_PRIVILEGES_CODE!r}; ); '
377
+ f' ( echo {exceptions.INSUFFICIENT_PRIVILEGES_CODE!r}; '
378
+ f' exit {exceptions.INSUFFICIENT_PRIVILEGES_CODE}; ); '
392
379
  ' fi; '
393
- 'fi')
380
+ 'fi;')
381
+
382
+ # Kubernetes automatically populates containers with critical
383
+ # environment variables, such as those for discovering services running
384
+ # in the cluster and CUDA/nvidia environment variables. We need to
385
+ # make sure these env vars are available in every task and ssh session.
386
+ # This is needed for GPU support and service discovery.
387
+ # See https://github.com/skypilot-org/skypilot/issues/2287 for more details.
388
+ # To do so, we capture env vars from the pod's runtime and write them to
389
+ # /etc/profile.d/, making them available for all users in future
390
+ # shell sessions.
391
+ set_k8s_env_var_cmd = docker_utils.SETUP_ENV_VARS_CMD
394
392
 
395
- # This check needs to run on a per-image basis, so running the check on
396
- # any one pod is sufficient.
397
- new_node = new_nodes[0]
398
- pod_name = new_node.metadata.name
393
+ check_apt_update_complete_cmd = (
394
+ 'echo "Checking if apt update from container init is complete..."; '
395
+ 'timeout_secs=600; '
396
+ 'start_time=$(date +%s); '
397
+ 'while ! grep -q "Fetched" /tmp/apt-update.log 2>/dev/null; do '
398
+ ' echo "apt update still running. Logs:"; '
399
+ ' cat /tmp/apt-update.log; '
400
+ ' current_time=$(date +%s); '
401
+ ' elapsed=$((current_time - start_time)); '
402
+ ' if [ $elapsed -ge $timeout_secs ]; then '
403
+ ' echo "Timed out waiting for apt update"; '
404
+ ' exit 1; '
405
+ ' fi; '
406
+ ' sleep 5; '
407
+ 'done; '
408
+ 'echo "apt update complete."; ')
399
409
 
400
- runner = command_runner.KubernetesCommandRunner(
401
- ((namespace, context), pod_name))
402
- logger.info(f'{"-"*20}Start: Check user privilege in pod {pod_name!r} '
403
- f'{"-"*20}')
404
-
405
- def _run_privilege_check():
406
- rc, stdout, stderr = runner.run(check_k8s_user_sudo_cmd,
407
- require_outputs=True,
408
- separate_stderr=True,
409
- stream_logs=False)
410
- _raise_command_running_error('check user privilege',
411
- check_k8s_user_sudo_cmd, pod_name, rc,
412
- stdout + stderr)
413
- return stdout
414
-
415
- stdout = _run_function_with_retries(
416
- _run_privilege_check, f'check user privilege in pod {pod_name!r}')
417
-
418
- if stdout == str(exceptions.INSUFFICIENT_PRIVILEGES_CODE):
419
- raise config_lib.KubernetesError(
420
- 'Insufficient system privileges detected. '
421
- 'Ensure the default user has root access or '
422
- '"sudo" is installed and the user is added to the sudoers '
423
- 'from the image.')
424
- logger.info(f'{"-"*20}End: Check user privilege in pod {pod_name!r} '
425
- f'{"-"*20}')
426
-
427
-
428
- def _setup_ssh_in_pods(namespace: str, context: Optional[str],
429
- new_nodes: List) -> None:
430
- # Setting up ssh for the pod instance. This is already setup for
431
- # the jump pod so it does not need to be run for it.
432
- set_k8s_ssh_cmd = (
433
- 'set -ex; '
410
+ install_ssh_k8s_cmd = (
434
411
  'prefix_cmd() '
435
412
  '{ if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; }; '
436
413
  'export DEBIAN_FRONTEND=noninteractive;'
437
- '$(prefix_cmd) apt-get update;'
438
- '$(prefix_cmd) apt install openssh-server rsync -y; '
414
+ 'echo "Installing missing packages..."; '
415
+ 'for i in {1..5}; do '
416
+ ' output=$($(prefix_cmd) apt install openssh-server rsync -y 2>&1); '
417
+ ' rc=$?; '
418
+ ' if [ $rc -eq 0 ]; then '
419
+ ' break; '
420
+ ' fi; '
421
+ ' echo "$output" | grep -qi "could not get lock" || '
422
+ ' grep -qi "Unable to acquire the dpkg frontend lock"; '
423
+ ' if [ $? -eq 0 ]; then '
424
+ ' echo "apt install failed due to lock, retrying. (Attempt $i/5)"; '
425
+ ' sleep 5; '
426
+ ' else '
427
+ ' echo "apt install failed for a non-lock reason: $output"; '
428
+ ' exit $rc; '
429
+ ' fi; '
430
+ 'done; '
431
+ 'if [ $rc -ne 0 ]; then '
432
+ ' echo "apt install failed after 5 attempts due to lock errors."; '
433
+ ' exit $rc; '
434
+ 'fi; '
439
435
  '$(prefix_cmd) mkdir -p /var/run/sshd; '
440
436
  '$(prefix_cmd) '
441
437
  'sed -i "s/PermitRootLogin prohibit-password/PermitRootLogin yes/" '
@@ -456,24 +452,35 @@ def _setup_ssh_in_pods(namespace: str, context: Optional[str],
456
452
  # See https://www.educative.io/answers/error-mesg-ttyname-failed-inappropriate-ioctl-for-device # pylint: disable=line-too-long
457
453
  '$(prefix_cmd) sed -i "s/mesg n/tty -s \\&\\& mesg n/" ~/.profile;')
458
454
 
459
- def _setup_ssh_thread(new_node):
455
+ pre_init_cmd = ('set -ex; ' + check_k8s_user_sudo_cmd +
456
+ set_k8s_env_var_cmd + check_apt_update_complete_cmd +
457
+ install_ssh_k8s_cmd)
458
+
459
+ def _pre_init_thread(new_node):
460
460
  pod_name = new_node.metadata.name
461
+ logger.info(f'{"-"*20}Start: Pre-init in pod {pod_name!r} {"-"*20}')
461
462
  runner = command_runner.KubernetesCommandRunner(
462
463
  ((namespace, context), pod_name))
463
- logger.info(f'{"-"*20}Start: Set up SSH in pod {pod_name!r} {"-"*20}')
464
464
 
465
- def _run_ssh_setup():
466
- rc, stdout, _ = runner.run(set_k8s_ssh_cmd,
467
- require_outputs=True,
468
- stream_logs=False)
469
- _raise_command_running_error('setup ssh', set_k8s_ssh_cmd, pod_name,
470
- rc, stdout)
465
+ # Run the combined pre-init command
466
+ rc, stdout, _ = runner.run(pre_init_cmd,
467
+ require_outputs=True,
468
+ stream_logs=False)
469
+ if rc == exceptions.INSUFFICIENT_PRIVILEGES_CODE:
470
+ raise config_lib.KubernetesError(
471
+ 'Insufficient system privileges detected. '
472
+ 'Ensure the default user has root access or '
473
+ '"sudo" is installed and the user is added to the sudoers '
474
+ 'from the image.')
475
+
476
+ op_name = 'pre-init'
477
+ _raise_command_running_error(op_name, pre_init_cmd, pod_name, rc,
478
+ stdout)
471
479
 
472
- _run_function_with_retries(_run_ssh_setup,
473
- f'setup ssh in pod {pod_name!r}')
474
- logger.info(f'{"-"*20}End: Set up SSH in pod {pod_name!r} {"-"*20}')
480
+ logger.info(f'{"-"*20}End: Pre-init in pod {pod_name!r} {"-"*20}')
475
481
 
476
- subprocess_utils.run_in_parallel(_setup_ssh_thread, new_nodes, NUM_THREADS)
482
+ # Run pre_init in parallel across all new_nodes
483
+ subprocess_utils.run_in_parallel(_pre_init_thread, new_nodes, NUM_THREADS)
477
484
 
478
485
 
479
486
  def _label_pod(namespace: str, context: Optional[str], pod_name: str,
@@ -724,13 +731,8 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
724
731
  f'pods: {list(uninitialized_pods.keys())}')
725
732
  uninitialized_pods_list = list(uninitialized_pods.values())
726
733
 
727
- # Setup SSH and environment variables in pods.
728
- # Make sure commands used in these methods are generic and work
729
- # on most base images. E.g., do not use Python, since that may not
730
- # be installed by default.
731
- _check_user_privilege(namespace, context, uninitialized_pods_list)
732
- _setup_ssh_in_pods(namespace, context, uninitialized_pods_list)
733
- _set_env_vars_in_pods(namespace, context, uninitialized_pods_list)
734
+ # Run pre-init steps in the pod.
735
+ pre_init(namespace, context, uninitialized_pods_list)
734
736
 
735
737
  for pod in uninitialized_pods.values():
736
738
  _label_pod(namespace,
@@ -0,0 +1,15 @@
1
+ """OCI provisioner for SkyPilot.
2
+
3
+ History:
4
+ - Hysun He (hysun.he@oracle.com) @ Oct.16, 2024: Initial implementation
5
+ """
6
+
7
+ from sky.provision.oci.config import bootstrap_instances
8
+ from sky.provision.oci.instance import cleanup_ports
9
+ from sky.provision.oci.instance import get_cluster_info
10
+ from sky.provision.oci.instance import open_ports
11
+ from sky.provision.oci.instance import query_instances
12
+ from sky.provision.oci.instance import run_instances
13
+ from sky.provision.oci.instance import stop_instances
14
+ from sky.provision.oci.instance import terminate_instances
15
+ from sky.provision.oci.instance import wait_instances
@@ -0,0 +1,51 @@
1
+ """OCI configuration bootstrapping.
2
+
3
+ Creates the resource group and deploys the configuration template to OCI for
4
+ a cluster to be launched.
5
+
6
+ History:
7
+ - Hysun He (hysun.he@oracle.com) @ Oct.16, 2024: Initial implementation
8
+ """
9
+
10
+ from sky import exceptions
11
+ from sky import sky_logging
12
+ from sky.adaptors import oci as oci_adaptor
13
+ from sky.clouds.utils import oci_utils
14
+ from sky.provision import common
15
+ from sky.provision.oci.query_utils import query_helper
16
+
17
+ logger = sky_logging.init_logger(__name__)
18
+
19
+
20
+ @common.log_function_start_end
21
+ def bootstrap_instances(
22
+ region: str, cluster_name_on_cloud: str,
23
+ config: common.ProvisionConfig) -> common.ProvisionConfig:
24
+ """See sky/provision/__init__.py"""
25
+ # OCI module import and oci client
26
+ oci_adaptor.get_core_client(region, oci_utils.oci_config.get_profile())
27
+
28
+ # Find / create a compartment for creating instances.
29
+ compartment = query_helper.find_compartment(region)
30
+
31
+ # Find the configured VCN, or create a new one.
32
+ vcn = query_helper.find_create_vcn_subnet(region)
33
+ if vcn is None:
34
+ # pylint: disable=line-too-long
35
+ raise exceptions.ResourcesUnavailableError(
36
+ 'Failed to create a new VCN, possibly you hit the resource limitation.'
37
+ )
38
+
39
+ node_config = config.node_config
40
+
41
+ # Subscribe the image if it is from Marketplace listing.
42
+ query_helper.subscribe_image(
43
+ compartment_id=compartment,
44
+ listing_id=node_config['AppCatalogListingId'],
45
+ resource_version=node_config['ResourceVersion'],
46
+ region=region,
47
+ )
48
+
49
+ logger.info(f'Using cluster name: {cluster_name_on_cloud}')
50
+
51
+ return config