skypilot-nightly 1.0.0.dev20250922__py3-none-any.whl → 1.0.0.dev20250926__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This is a potentially problematic release.


This version of skypilot-nightly might be problematic; see the package's registry page for more details.

Files changed (123):
  1. sky/__init__.py +2 -2
  2. sky/backends/backend.py +10 -0
  3. sky/backends/backend_utils.py +207 -79
  4. sky/backends/cloud_vm_ray_backend.py +37 -13
  5. sky/backends/local_docker_backend.py +9 -0
  6. sky/client/cli/command.py +112 -53
  7. sky/client/common.py +4 -2
  8. sky/client/sdk.py +17 -7
  9. sky/client/sdk_async.py +4 -2
  10. sky/clouds/kubernetes.py +2 -1
  11. sky/clouds/runpod.py +20 -7
  12. sky/core.py +9 -54
  13. sky/dashboard/out/404.html +1 -1
  14. sky/dashboard/out/_next/static/{KP6HCNMqb_bnJB17oplgW → VXU6_xE28M55BOdwmUUJS}/_buildManifest.js +1 -1
  15. sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +1 -0
  16. sky/dashboard/out/_next/static/chunks/6856-2b3600ff2854d066.js +1 -0
  17. sky/dashboard/out/_next/static/chunks/8969-d8bc3a2b9cf839a9.js +1 -0
  18. sky/dashboard/out/_next/static/chunks/9037-d0c00018a5ba198c.js +6 -0
  19. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-ad77b12fc736dca3.js +16 -0
  20. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-9525660179df3605.js → [cluster]-e052384df65ef200.js} +1 -1
  21. sky/dashboard/out/_next/static/chunks/{webpack-26167a9e6d91fa51.js → webpack-8e64d11e58eab5cb.js} +1 -1
  22. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  23. sky/dashboard/out/clusters/[cluster].html +1 -1
  24. sky/dashboard/out/clusters.html +1 -1
  25. sky/dashboard/out/config.html +1 -1
  26. sky/dashboard/out/index.html +1 -1
  27. sky/dashboard/out/infra/[context].html +1 -1
  28. sky/dashboard/out/infra.html +1 -1
  29. sky/dashboard/out/jobs/[job].html +1 -1
  30. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  31. sky/dashboard/out/jobs.html +1 -1
  32. sky/dashboard/out/users.html +1 -1
  33. sky/dashboard/out/volumes.html +1 -1
  34. sky/dashboard/out/workspace/new.html +1 -1
  35. sky/dashboard/out/workspaces/[name].html +1 -1
  36. sky/dashboard/out/workspaces.html +1 -1
  37. sky/data/mounting_utils.py +19 -10
  38. sky/execution.py +4 -2
  39. sky/global_user_state.py +271 -67
  40. sky/jobs/client/sdk.py +10 -1
  41. sky/jobs/constants.py +2 -0
  42. sky/jobs/controller.py +11 -7
  43. sky/jobs/server/core.py +5 -3
  44. sky/jobs/server/server.py +15 -11
  45. sky/jobs/utils.py +1 -1
  46. sky/logs/agent.py +30 -3
  47. sky/logs/aws.py +9 -19
  48. sky/provision/__init__.py +2 -1
  49. sky/provision/aws/instance.py +2 -1
  50. sky/provision/azure/instance.py +2 -1
  51. sky/provision/cudo/instance.py +2 -2
  52. sky/provision/do/instance.py +2 -2
  53. sky/provision/docker_utils.py +41 -19
  54. sky/provision/fluidstack/instance.py +2 -2
  55. sky/provision/gcp/instance.py +2 -1
  56. sky/provision/hyperbolic/instance.py +2 -1
  57. sky/provision/instance_setup.py +1 -1
  58. sky/provision/kubernetes/instance.py +134 -8
  59. sky/provision/lambda_cloud/instance.py +2 -1
  60. sky/provision/nebius/instance.py +2 -1
  61. sky/provision/oci/instance.py +2 -1
  62. sky/provision/paperspace/instance.py +2 -2
  63. sky/provision/primeintellect/instance.py +2 -2
  64. sky/provision/provisioner.py +1 -0
  65. sky/provision/runpod/__init__.py +2 -0
  66. sky/provision/runpod/instance.py +2 -2
  67. sky/provision/scp/instance.py +2 -2
  68. sky/provision/seeweb/instance.py +2 -1
  69. sky/provision/vast/instance.py +2 -1
  70. sky/provision/vsphere/instance.py +6 -5
  71. sky/schemas/api/responses.py +2 -1
  72. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  73. sky/serve/autoscalers.py +2 -0
  74. sky/serve/client/impl.py +45 -19
  75. sky/serve/replica_managers.py +12 -5
  76. sky/serve/serve_utils.py +5 -7
  77. sky/serve/server/core.py +9 -6
  78. sky/serve/server/impl.py +78 -25
  79. sky/serve/server/server.py +4 -5
  80. sky/serve/service_spec.py +33 -0
  81. sky/server/constants.py +1 -1
  82. sky/server/daemons.py +2 -3
  83. sky/server/requests/executor.py +56 -6
  84. sky/server/requests/payloads.py +32 -8
  85. sky/server/requests/preconditions.py +2 -3
  86. sky/server/rest.py +2 -0
  87. sky/server/server.py +28 -19
  88. sky/server/stream_utils.py +34 -12
  89. sky/setup_files/dependencies.py +5 -2
  90. sky/setup_files/setup.py +44 -44
  91. sky/skylet/constants.py +4 -1
  92. sky/skylet/events.py +42 -0
  93. sky/templates/jobs-controller.yaml.j2 +3 -0
  94. sky/templates/kubernetes-ray.yml.j2 +24 -18
  95. sky/usage/usage_lib.py +3 -0
  96. sky/utils/cli_utils/status_utils.py +4 -5
  97. sky/utils/context.py +104 -29
  98. sky/utils/controller_utils.py +7 -6
  99. sky/utils/db/db_utils.py +5 -1
  100. sky/utils/db/migration_utils.py +1 -1
  101. sky/utils/kubernetes/create_cluster.sh +13 -28
  102. sky/utils/kubernetes/delete_cluster.sh +10 -7
  103. sky/utils/kubernetes/generate_kind_config.py +6 -66
  104. sky/utils/kubernetes/kubernetes_deploy_utils.py +194 -38
  105. sky/utils/kubernetes_enums.py +5 -0
  106. sky/utils/ux_utils.py +35 -1
  107. sky/utils/yaml_utils.py +9 -0
  108. sky/volumes/client/sdk.py +44 -8
  109. sky/volumes/server/core.py +1 -0
  110. sky/volumes/server/server.py +33 -7
  111. sky/volumes/volume.py +35 -28
  112. {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/METADATA +38 -33
  113. {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/RECORD +118 -117
  114. sky/dashboard/out/_next/static/chunks/1121-4ff1ec0dbc5792ab.js +0 -1
  115. sky/dashboard/out/_next/static/chunks/6856-9a2538f38c004652.js +0 -1
  116. sky/dashboard/out/_next/static/chunks/8969-a39efbadcd9fde80.js +0 -1
  117. sky/dashboard/out/_next/static/chunks/9037-472ee1222cb1e158.js +0 -6
  118. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1e9248ddbddcd122.js +0 -16
  119. /sky/dashboard/out/_next/static/{KP6HCNMqb_bnJB17oplgW → VXU6_xE28M55BOdwmUUJS}/_ssgManifest.js +0 -0
  120. {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/WHEEL +0 -0
  121. {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/entry_points.txt +0 -0
  122. {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/licenses/LICENSE +0 -0
  123. {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/top_level.txt +0 -0
sky/jobs/controller.py CHANGED
@@ -3,6 +3,7 @@
3
3
  import asyncio
4
4
  import logging
5
5
  import os
6
+ import pathlib
6
7
  import resource
7
8
  import shutil
8
9
  import sys
@@ -17,6 +18,7 @@ import sky
17
18
  from sky import core
18
19
  from sky import exceptions
19
20
  from sky import sky_logging
21
+ from sky import skypilot_config
20
22
  from sky.backends import backend_utils
21
23
  from sky.backends import cloud_vm_ray_backend
22
24
  from sky.data import data_utils
@@ -56,6 +58,7 @@ async def create_background_task(coro: typing.Coroutine) -> None:
56
58
  async with _background_tasks_lock:
57
59
  task = asyncio.create_task(coro)
58
60
  _background_tasks.add(task)
61
+ # TODO(cooperc): Discard needs a lock?
59
62
  task.add_done_callback(_background_tasks.discard)
60
63
 
61
64
 
@@ -896,6 +899,9 @@ class Controller:
896
899
  # some data here.
897
900
  raise error
898
901
 
902
+ # Use context.contextual to enable per-job output redirection and env var
903
+ # isolation.
904
+ @context.contextual
899
905
  async def run_job_loop(self,
900
906
  job_id: int,
901
907
  dag_yaml: str,
@@ -904,13 +910,9 @@ class Controller:
904
910
  env_file_path: Optional[str] = None,
905
911
  pool: Optional[str] = None):
906
912
  """Background task that runs the job loop."""
907
- # Replace os.environ with ContextualEnviron to enable per-job
908
- # environment isolation. This allows each job to have its own
909
- # environment variables without affecting other jobs or the main
910
- # process.
911
- context.initialize()
912
913
  ctx = context.get()
913
- ctx.redirect_log(log_file) # type: ignore
914
+ assert ctx is not None, 'Context is not initialized'
915
+ ctx.redirect_log(pathlib.Path(log_file))
914
916
 
915
917
  # Load and apply environment variables from the job's environment file
916
918
  if env_file_path and os.path.exists(env_file_path):
@@ -921,13 +923,15 @@ class Controller:
921
923
  f'{list(env_vars.keys())}')
922
924
 
923
925
  # Apply environment variables to the job's context
924
- ctx = context.get()
925
926
  if ctx is not None:
926
927
  for key, value in env_vars.items():
927
928
  if value is not None:
928
929
  ctx.override_envs({key: value})
929
930
  job_logger.debug(
930
931
  f'Set environment variable: {key}={value}')
932
+ # Reload the skypilot config for this context to make sure
933
+ # the latest config is used.
934
+ skypilot_config.reload_config()
931
935
  else:
932
936
  job_logger.error(
933
937
  'Context is None, cannot set environment variables')
sky/jobs/server/core.py CHANGED
@@ -281,8 +281,7 @@ def launch(
281
281
  # Check whether cached jobs controller cluster is accessible
282
282
  cluster_name = (
283
283
  controller_utils.Controllers.JOBS_CONTROLLER.value.cluster_name)
284
- record = global_user_state.get_cluster_from_name(cluster_name)
285
- if record is not None:
284
+ if global_user_state.cluster_with_name_exists(cluster_name):
286
285
  # there is a cached jobs controller cluster
287
286
  try:
288
287
  # TODO: do something with returned status?
@@ -369,6 +368,8 @@ def launch(
369
368
  'priority': priority,
370
369
  'consolidation_mode_job_id': consolidation_mode_job_id,
371
370
  'pool': pool,
371
+ 'job_controller_indicator_file':
372
+ managed_job_constants.JOB_CONTROLLER_INDICATOR_FILE,
372
373
  **controller_utils.shared_controller_vars_to_fill(
373
374
  controller,
374
375
  remote_user_config_path=remote_user_config_path,
@@ -959,9 +960,10 @@ def pool_apply(
959
960
  task: 'sky.Task',
960
961
  pool_name: str,
961
962
  mode: serve_utils.UpdateMode = serve_utils.DEFAULT_UPDATE_MODE,
963
+ workers: Optional[int] = None,
962
964
  ) -> None:
963
965
  """Apply a config to a pool."""
964
- return impl.apply(task, pool_name, mode, pool=True)
966
+ return impl.apply(task, workers, pool_name, mode, pool=True)
965
967
 
966
968
 
967
969
  @usage_lib.entrypoint
sky/jobs/server/server.py CHANGED
@@ -94,23 +94,27 @@ async def logs(
94
94
  request: fastapi.Request, jobs_logs_body: payloads.JobsLogsBody,
95
95
  background_tasks: fastapi.BackgroundTasks
96
96
  ) -> fastapi.responses.StreamingResponse:
97
- executor.schedule_request(
97
+ schedule_type = api_requests.ScheduleType.SHORT
98
+ if jobs_logs_body.refresh:
99
+ # When refresh is specified, the job controller might be restarted,
100
+ # which takes longer time to finish. We schedule it to long executor.
101
+ schedule_type = api_requests.ScheduleType.LONG
102
+ request_task = executor.prepare_request(
98
103
  request_id=request.state.request_id,
99
104
  request_name='jobs.logs',
100
105
  request_body=jobs_logs_body,
101
106
  func=core.tail_logs,
102
- # TODO(aylei): We have tail logs scheduled as SHORT request, because it
103
- # should be responsive. However, it can be long running if the user's
104
- # job keeps running, and we should avoid it taking the SHORT worker
105
- # indefinitely.
106
- # When refresh is True we schedule it as LONG because a controller
107
- # restart might be needed.
108
- schedule_type=api_requests.ScheduleType.LONG
109
- if jobs_logs_body.refresh else api_requests.ScheduleType.SHORT,
107
+ schedule_type=schedule_type,
110
108
  request_cluster_name=common.JOB_CONTROLLER_NAME,
111
109
  )
112
- request_task = await api_requests.get_request_async(request.state.request_id
113
- )
110
+ if schedule_type == api_requests.ScheduleType.LONG:
111
+ executor.schedule_prepared_request(request_task)
112
+ else:
113
+ # For short request, run in the coroutine to avoid blocking
114
+ # short workers.
115
+ task = executor.execute_request_in_coroutine(request_task)
116
+ # Cancel the coroutine after the request is done or client disconnects
117
+ background_tasks.add_task(task.cancel)
114
118
 
115
119
  return stream_utils.stream_response(
116
120
  request_id=request_task.request_id,
sky/jobs/utils.py CHANGED
@@ -156,7 +156,7 @@ def _validate_consolidation_mode_config(
156
156
  if current_is_consolidation_mode:
157
157
  controller_cn = (
158
158
  controller_utils.Controllers.JOBS_CONTROLLER.value.cluster_name)
159
- if global_user_state.get_cluster_from_name(controller_cn) is not None:
159
+ if global_user_state.cluster_with_name_exists(controller_cn):
160
160
  with ux_utils.print_exception_no_traceback():
161
161
  raise exceptions.InconsistentConsolidationModeError(
162
162
  f'{colorama.Fore.RED}Consolidation mode for jobs is '
sky/logs/agent.py CHANGED
@@ -34,7 +34,8 @@ class FluentbitAgent(LoggingAgent):
34
34
  def get_setup_command(self,
35
35
  cluster_name: resources_utils.ClusterName) -> str:
36
36
  install_cmd = (
37
- 'if ! command -v fluent-bit >/dev/null 2>&1; then '
37
+ # pylint: disable=line-too-long
38
+ 'if ! command -v fluent-bit >/dev/null 2>&1 && [ ! -f /opt/fluent-bit/bin/fluent-bit ]; then '
38
39
  'sudo apt-get update; sudo apt-get install -y gnupg; '
39
40
  # pylint: disable=line-too-long
40
41
  'sudo sh -c \'curl https://packages.fluentbit.io/fluentbit.key | gpg --dearmor > /usr/share/keyrings/fluentbit-keyring.gpg\'; '
@@ -51,14 +52,32 @@ class FluentbitAgent(LoggingAgent):
51
52
  cfg_path = os.path.join(constants.LOGGING_CONFIG_DIR, 'fluentbit.yaml')
52
53
  config_cmd = (f'mkdir -p {constants.LOGGING_CONFIG_DIR} && '
53
54
  f'echo {shlex.quote(cfg)} > {cfg_path}')
55
+ kill_prior_cmd = (
56
+ 'if [ -f "/tmp/fluentbit.pid" ]; then '
57
+ # pylint: disable=line-too-long
58
+ 'echo "Killing prior fluent-bit process $(cat /tmp/fluentbit.pid)"; '
59
+ 'kill "$(cat /tmp/fluentbit.pid)" || true; '
60
+ 'fi')
54
61
  start_cmd = ('nohup $(command -v fluent-bit || '
55
62
  'echo "/opt/fluent-bit/bin/fluent-bit") '
56
- f'-c {cfg_path} > /tmp/fluentbit.log 2>&1 &')
57
- return f'set -e; {install_cmd}; {config_cmd}; {start_cmd}'
63
+ f'-c {cfg_path} > /tmp/fluentbit.log 2>&1 & '
64
+ 'echo $! > /tmp/fluentbit.pid')
65
+ return ('set -e; '
66
+ f'{install_cmd}; '
67
+ f'{config_cmd}; '
68
+ f'{kill_prior_cmd}; '
69
+ f'{start_cmd}')
58
70
 
59
71
  def fluentbit_config(self,
60
72
  cluster_name: resources_utils.ClusterName) -> str:
61
73
  cfg_dict = {
74
+ 'parsers': [{
75
+ 'name': 'sky-ray-parser',
76
+ 'format': 'regex',
77
+ # pylint: disable=line-too-long
78
+ 'regex': r'(?:\x1b\[[\d;]+m)?\((?<worker_name>[^,]+)(?:,\s*rank=(?<rank>\d+))?(?:,\s*pid=(?<pid>\d+))(?:,\s*ip=(?<ip>[\d.]+))?\)(?:\x1b\[[\d;]+m)?\s*(?<log_line>.*)',
79
+ 'types': 'rank:integer pid:integer',
80
+ }],
62
81
  'pipeline': {
63
82
  'inputs': [{
64
83
  'name': 'tail',
@@ -70,6 +89,14 @@ class FluentbitAgent(LoggingAgent):
70
89
  # right after the job completion.
71
90
  'refresh_interval': 1,
72
91
  }],
92
+ 'filters': [{
93
+ 'name': 'parser',
94
+ 'match': '*',
95
+ 'key_name': 'log',
96
+ 'parser': 'sky-ray-parser',
97
+ 'preserve_key': 'on', # preserve field for backwards compat
98
+ 'reserve_data': 'on',
99
+ }],
73
100
  'outputs': [self.fluentbit_output_config(cluster_name)],
74
101
  }
75
102
  }
sky/logs/aws.py CHANGED
@@ -5,7 +5,6 @@ from typing import Any, Dict, Optional
5
5
  import pydantic
6
6
 
7
7
  from sky.logs.agent import FluentbitAgent
8
- from sky.skylet import constants
9
8
  from sky.utils import resources_utils
10
9
  from sky.utils import yaml_utils
11
10
 
@@ -176,6 +175,8 @@ class CloudwatchLoggingAgent(FluentbitAgent):
176
175
  Returns:
177
176
  The Fluent Bit configuration as a YAML string.
178
177
  """
178
+ cfg_dict = yaml_utils.read_yaml_str(
179
+ super().fluentbit_config(cluster_name))
179
180
  display_name = cluster_name.display_name
180
181
  unique_name = cluster_name.name_on_cloud
181
182
  # Build tags for the log stream
@@ -197,24 +198,13 @@ class CloudwatchLoggingAgent(FluentbitAgent):
197
198
  'value': value
198
199
  })
199
200
 
200
- cfg_dict = {
201
- 'pipeline': {
202
- 'inputs': [{
203
- 'name': 'tail',
204
- 'path': f'{constants.SKY_LOGS_DIRECTORY}/*/*.log',
205
- 'path_key': 'log_path',
206
- # Shorten the refresh interval from 60s to 1s since every
207
- # job creates a new log file and we must be responsive
208
- # for this: the VM might be autodown within a minute
209
- # right after the job completion.
210
- 'refresh_interval': 1,
211
- 'processors': {
212
- 'logs': log_processors,
213
- }
214
- }],
215
- 'outputs': [self.fluentbit_output_config(cluster_name)],
216
- }
217
- }
201
+ # Add log processors to config
202
+ processors_config = cfg_dict['pipeline']['inputs'][0].get(
203
+ 'processors', {})
204
+ processors_logs_config = processors_config.get('logs', [])
205
+ processors_logs_config.extend(log_processors)
206
+ processors_config['logs'] = processors_logs_config
207
+ cfg_dict['pipeline']['inputs'][0]['processors'] = processors_config
218
208
 
219
209
  return yaml_utils.dump_yaml_str(cfg_dict)
220
210
 
sky/provision/__init__.py CHANGED
@@ -168,7 +168,8 @@ def map_all_volumes_usedby(
168
168
 
169
169
 
170
170
  @_route_to_cloud_impl
171
- def run_instances(provider_name: str, region: str, cluster_name_on_cloud: str,
171
+ def run_instances(provider_name: str, region: str, cluster_name: str,
172
+ cluster_name_on_cloud: str,
172
173
  config: common.ProvisionConfig) -> common.ProvisionRecord:
173
174
  """Start instances with bootstrapped configuration."""
174
175
  raise NotImplementedError
@@ -311,9 +311,10 @@ def _get_head_instance_id(instances: List) -> Optional[str]:
311
311
  return head_instance_id
312
312
 
313
313
 
314
- def run_instances(region: str, cluster_name_on_cloud: str,
314
+ def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
315
315
  config: common.ProvisionConfig) -> common.ProvisionRecord:
316
316
  """See sky/provision/__init__.py"""
317
+ del cluster_name # unused
317
318
  ec2 = _default_ec2_resource(region)
318
319
  # NOTE: We set max_attempts=0 for fast failing when the resource is not
319
320
  # available (although the doc says it will only retry for network
@@ -362,9 +362,10 @@ def _create_instances(compute_client: 'azure_compute.ComputeManagementClient',
362
362
  return instances
363
363
 
364
364
 
365
- def run_instances(region: str, cluster_name_on_cloud: str,
365
+ def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
366
366
  config: common.ProvisionConfig) -> common.ProvisionRecord:
367
367
  """See sky/provision/__init__.py"""
368
+ del cluster_name # unused
368
369
  # TODO(zhwu): This function is too long. We should refactor it.
369
370
  provider_config = config.provider_config
370
371
  resource_group = provider_config['resource_group']
@@ -40,10 +40,10 @@ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
40
40
  return head_instance_id
41
41
 
42
42
 
43
- def run_instances(region: str, cluster_name_on_cloud: str,
43
+ def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
44
44
  config: common.ProvisionConfig) -> common.ProvisionRecord:
45
45
  """Runs instances for the given cluster."""
46
-
46
+ del cluster_name # unused
47
47
  pending_status = ['pend', 'init', 'prol', 'boot']
48
48
 
49
49
  while True:
@@ -26,10 +26,10 @@ def _get_head_instance(
26
26
  return None
27
27
 
28
28
 
29
- def run_instances(region: str, cluster_name_on_cloud: str,
29
+ def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
30
30
  config: common.ProvisionConfig) -> common.ProvisionRecord:
31
31
  """Runs instances for the given cluster."""
32
-
32
+ del cluster_name # unused
33
33
  pending_status = ['new']
34
34
  newly_started_instances = utils.filter_instances(cluster_name_on_cloud,
35
35
  pending_status + ['off'])
@@ -3,7 +3,7 @@
3
3
  import dataclasses
4
4
  import shlex
5
5
  import time
6
- from typing import Any, Dict, List
6
+ from typing import Any, Dict, List, Optional
7
7
 
8
8
  from sky import sky_logging
9
9
  from sky.skylet import constants
@@ -192,12 +192,16 @@ class DockerInitializer:
192
192
  self.docker_cmd = 'podman' if use_podman else 'docker'
193
193
  self.log_path = log_path
194
194
 
195
- def _run(self,
196
- cmd,
197
- run_env='host',
198
- wait_for_docker_daemon: bool = False,
199
- separate_stderr: bool = False,
200
- log_err_when_fail: bool = True) -> str:
195
+ def _run(
196
+ self,
197
+ cmd,
198
+ run_env='host',
199
+ wait_for_docker_daemon: bool = False,
200
+ separate_stderr: bool = False,
201
+ log_err_when_fail: bool = True,
202
+ flock_name: Optional[str] = None,
203
+ flock_args: Optional[str] = None,
204
+ ) -> str:
201
205
 
202
206
  if run_env == 'docker':
203
207
  cmd = self._docker_expand_user(cmd, any_char=True)
@@ -206,8 +210,13 @@ class DockerInitializer:
206
210
  # an error: `the input device is not a TTY`, and it works without
207
211
  # `-it` flag.
208
212
  # TODO(zhwu): ray use the `-it` flag, we need to check why.
209
- cmd = (f'{self.docker_cmd} exec {self.container_name} /bin/bash -c'
210
- f' {shlex.quote(cmd)} ')
213
+ cmd = (f'{self.docker_cmd} exec -u 0 {self.container_name}'
214
+ f' /bin/bash -c {shlex.quote(cmd)} ')
215
+
216
+ if flock_name is not None:
217
+ flock_args = flock_args or ''
218
+ cmd = (f'flock {flock_args} /tmp/{flock_name} '
219
+ f'-c {shlex.quote(cmd)}')
211
220
 
212
221
  logger.debug(f'+ {cmd}')
213
222
  start = time.time()
@@ -259,7 +268,10 @@ class DockerInitializer:
259
268
  if self._check_container_exited():
260
269
  self.initialized = True
261
270
  self._run(f'{self.docker_cmd} start {self.container_name}')
262
- self._run('sudo service ssh start', run_env='docker')
271
+ self._run('sudo service ssh start',
272
+ run_env='docker',
273
+ flock_name=f'{self.container_name}.sky.lifecycle.lock',
274
+ flock_args='-s -w 1')
263
275
  return self._run('whoami', run_env='docker')
264
276
 
265
277
  # SkyPilot: Docker login if user specified a private docker registry.
@@ -358,7 +370,9 @@ class DockerInitializer:
358
370
  self._auto_configure_shm(user_docker_run_options)),
359
371
  self.docker_cmd,
360
372
  )
361
- self._run(f'{remove_container_cmd}; {start_command}')
373
+ self._run(f'{remove_container_cmd} && {start_command}',
374
+ flock_name=f'{self.container_name}.sky.lifecycle.lock',
375
+ flock_args='-x -w 10')
362
376
 
363
377
  # SkyPilot: Setup Commands.
364
378
  # TODO(zhwu): the following setups should be aligned with the kubernetes
@@ -376,14 +390,18 @@ class DockerInitializer:
376
390
  'echo "export DEBIAN_FRONTEND=noninteractive" >> ~/.bashrc;',
377
391
  run_env='docker')
378
392
  # Install dependencies.
379
- self._run(
380
- 'sudo apt-get update; '
393
+ cmd = (
394
+ 'bash -lc \''
395
+ 'exec 200>/var/tmp/sky_apt.lock; '
396
+ 'flock -x -w 120 200 || exit 1; '
397
+ 'export DEBIAN_FRONTEND=noninteractive; '
398
+ 'apt-get -yq update && '
381
399
  # Our mount script will install gcsfuse without fuse package.
382
400
  # We need to install fuse package first to enable storage mount.
383
401
  # The dpkg option is to suppress the prompt for fuse installation.
384
- 'sudo apt-get -o DPkg::Options::="--force-confnew" install -y '
385
- 'rsync curl wget patch openssh-server python3-pip fuse;',
386
- run_env='docker')
402
+ 'apt-get -o DPkg::Options::=--force-confnew install -y '
403
+ 'rsync curl wget patch openssh-server python3-pip fuse\'')
404
+ self._run(cmd, run_env='docker')
387
405
 
388
406
  # Copy local authorized_keys to docker container.
389
407
  # Stop and disable jupyter service. This is to avoid port conflict on
@@ -459,9 +477,13 @@ class DockerInitializer:
459
477
  user_pos = string.find('~')
460
478
  if user_pos > -1:
461
479
  if self.home_dir is None:
462
- cmd = (f'{self.docker_cmd} exec {self.container_name} '
463
- 'printenv HOME')
464
- self.home_dir = self._run(cmd, separate_stderr=True)
480
+ cmd = (f'{self.docker_cmd} exec {self.container_name}'
481
+ ' printenv HOME')
482
+ self.home_dir = self._run(
483
+ cmd,
484
+ separate_stderr=True,
485
+ flock_name=f'{self.container_name}.sky.lifecycle.lock',
486
+ flock_args='-s -w 1')
465
487
  # Check for unexpected newline in home directory, which can be
466
488
  # a common issue when the output is mixed with stderr.
467
489
  assert '\n' not in self.home_dir, (
@@ -78,10 +78,10 @@ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
78
78
  return head_instance_id
79
79
 
80
80
 
81
- def run_instances(region: str, cluster_name_on_cloud: str,
81
+ def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
82
82
  config: common.ProvisionConfig) -> common.ProvisionRecord:
83
83
  """Runs instances for the given cluster."""
84
-
84
+ del cluster_name # unused
85
85
  pending_status = ['pending', 'provisioning']
86
86
  while True:
87
87
  instances = _filter_instances(cluster_name_on_cloud, pending_status)
@@ -360,9 +360,10 @@ def _run_instances(region: str, cluster_name_on_cloud: str,
360
360
  created_instance_ids=created_instance_ids)
361
361
 
362
362
 
363
- def run_instances(region: str, cluster_name_on_cloud: str,
363
+ def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
364
364
  config: common.ProvisionConfig) -> common.ProvisionRecord:
365
365
  """See sky/provision/__init__.py"""
366
+ del cluster_name # unused
366
367
  try:
367
368
  return _run_instances(region, cluster_name_on_cloud, config)
368
369
  except gcp.http_error_exception() as e:
@@ -64,8 +64,9 @@ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
64
64
  return next(iter(instances.keys()))
65
65
 
66
66
 
67
- def run_instances(region: str, cluster_name_on_cloud: str,
67
+ def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
68
68
  config: common.ProvisionConfig) -> common.ProvisionRecord:
69
+ del cluster_name # unused
69
70
  logger.info(f'Starting run_instances with region={region}, '
70
71
  f'cluster={cluster_name_on_cloud}')
71
72
  logger.debug(f'Config: {config}')
@@ -84,7 +84,7 @@ def _set_usage_run_id_cmd() -> str:
84
84
  latest one when the function is called.
85
85
  """
86
86
  return (
87
- f'cat {usage_constants.USAGE_RUN_ID_FILE} || '
87
+ f'cat {usage_constants.USAGE_RUN_ID_FILE} 2> /dev/null || '
88
88
  # The run id is retrieved locally for the current run, so that the
89
89
  # remote cluster will be set with the same run id as the initial
90
90
  # launch operation.