skypilot-nightly 1.0.0.dev20250922__py3-none-any.whl → 1.0.0.dev20250926__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +2 -2
- sky/backends/backend.py +10 -0
- sky/backends/backend_utils.py +207 -79
- sky/backends/cloud_vm_ray_backend.py +37 -13
- sky/backends/local_docker_backend.py +9 -0
- sky/client/cli/command.py +112 -53
- sky/client/common.py +4 -2
- sky/client/sdk.py +17 -7
- sky/client/sdk_async.py +4 -2
- sky/clouds/kubernetes.py +2 -1
- sky/clouds/runpod.py +20 -7
- sky/core.py +9 -54
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{KP6HCNMqb_bnJB17oplgW → VXU6_xE28M55BOdwmUUJS}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +1 -0
- sky/dashboard/out/_next/static/chunks/6856-2b3600ff2854d066.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-d8bc3a2b9cf839a9.js +1 -0
- sky/dashboard/out/_next/static/chunks/9037-d0c00018a5ba198c.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-ad77b12fc736dca3.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-9525660179df3605.js → [cluster]-e052384df65ef200.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-26167a9e6d91fa51.js → webpack-8e64d11e58eab5cb.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/mounting_utils.py +19 -10
- sky/execution.py +4 -2
- sky/global_user_state.py +271 -67
- sky/jobs/client/sdk.py +10 -1
- sky/jobs/constants.py +2 -0
- sky/jobs/controller.py +11 -7
- sky/jobs/server/core.py +5 -3
- sky/jobs/server/server.py +15 -11
- sky/jobs/utils.py +1 -1
- sky/logs/agent.py +30 -3
- sky/logs/aws.py +9 -19
- sky/provision/__init__.py +2 -1
- sky/provision/aws/instance.py +2 -1
- sky/provision/azure/instance.py +2 -1
- sky/provision/cudo/instance.py +2 -2
- sky/provision/do/instance.py +2 -2
- sky/provision/docker_utils.py +41 -19
- sky/provision/fluidstack/instance.py +2 -2
- sky/provision/gcp/instance.py +2 -1
- sky/provision/hyperbolic/instance.py +2 -1
- sky/provision/instance_setup.py +1 -1
- sky/provision/kubernetes/instance.py +134 -8
- sky/provision/lambda_cloud/instance.py +2 -1
- sky/provision/nebius/instance.py +2 -1
- sky/provision/oci/instance.py +2 -1
- sky/provision/paperspace/instance.py +2 -2
- sky/provision/primeintellect/instance.py +2 -2
- sky/provision/provisioner.py +1 -0
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +2 -2
- sky/provision/scp/instance.py +2 -2
- sky/provision/seeweb/instance.py +2 -1
- sky/provision/vast/instance.py +2 -1
- sky/provision/vsphere/instance.py +6 -5
- sky/schemas/api/responses.py +2 -1
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +45 -19
- sky/serve/replica_managers.py +12 -5
- sky/serve/serve_utils.py +5 -7
- sky/serve/server/core.py +9 -6
- sky/serve/server/impl.py +78 -25
- sky/serve/server/server.py +4 -5
- sky/serve/service_spec.py +33 -0
- sky/server/constants.py +1 -1
- sky/server/daemons.py +2 -3
- sky/server/requests/executor.py +56 -6
- sky/server/requests/payloads.py +32 -8
- sky/server/requests/preconditions.py +2 -3
- sky/server/rest.py +2 -0
- sky/server/server.py +28 -19
- sky/server/stream_utils.py +34 -12
- sky/setup_files/dependencies.py +5 -2
- sky/setup_files/setup.py +44 -44
- sky/skylet/constants.py +4 -1
- sky/skylet/events.py +42 -0
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +24 -18
- sky/usage/usage_lib.py +3 -0
- sky/utils/cli_utils/status_utils.py +4 -5
- sky/utils/context.py +104 -29
- sky/utils/controller_utils.py +7 -6
- sky/utils/db/db_utils.py +5 -1
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/kubernetes/create_cluster.sh +13 -28
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/kubernetes_deploy_utils.py +194 -38
- sky/utils/kubernetes_enums.py +5 -0
- sky/utils/ux_utils.py +35 -1
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +44 -8
- sky/volumes/server/core.py +1 -0
- sky/volumes/server/server.py +33 -7
- sky/volumes/volume.py +35 -28
- {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/METADATA +38 -33
- {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/RECORD +118 -117
- sky/dashboard/out/_next/static/chunks/1121-4ff1ec0dbc5792ab.js +0 -1
- sky/dashboard/out/_next/static/chunks/6856-9a2538f38c004652.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-a39efbadcd9fde80.js +0 -1
- sky/dashboard/out/_next/static/chunks/9037-472ee1222cb1e158.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1e9248ddbddcd122.js +0 -16
- /sky/dashboard/out/_next/static/{KP6HCNMqb_bnJB17oplgW → VXU6_xE28M55BOdwmUUJS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/top_level.txt +0 -0
sky/jobs/controller.py
CHANGED
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
import asyncio
|
|
4
4
|
import logging
|
|
5
5
|
import os
|
|
6
|
+
import pathlib
|
|
6
7
|
import resource
|
|
7
8
|
import shutil
|
|
8
9
|
import sys
|
|
@@ -17,6 +18,7 @@ import sky
|
|
|
17
18
|
from sky import core
|
|
18
19
|
from sky import exceptions
|
|
19
20
|
from sky import sky_logging
|
|
21
|
+
from sky import skypilot_config
|
|
20
22
|
from sky.backends import backend_utils
|
|
21
23
|
from sky.backends import cloud_vm_ray_backend
|
|
22
24
|
from sky.data import data_utils
|
|
@@ -56,6 +58,7 @@ async def create_background_task(coro: typing.Coroutine) -> None:
|
|
|
56
58
|
async with _background_tasks_lock:
|
|
57
59
|
task = asyncio.create_task(coro)
|
|
58
60
|
_background_tasks.add(task)
|
|
61
|
+
# TODO(cooperc): Discard needs a lock?
|
|
59
62
|
task.add_done_callback(_background_tasks.discard)
|
|
60
63
|
|
|
61
64
|
|
|
@@ -896,6 +899,9 @@ class Controller:
|
|
|
896
899
|
# some data here.
|
|
897
900
|
raise error
|
|
898
901
|
|
|
902
|
+
# Use context.contextual to enable per-job output redirection and env var
|
|
903
|
+
# isolation.
|
|
904
|
+
@context.contextual
|
|
899
905
|
async def run_job_loop(self,
|
|
900
906
|
job_id: int,
|
|
901
907
|
dag_yaml: str,
|
|
@@ -904,13 +910,9 @@ class Controller:
|
|
|
904
910
|
env_file_path: Optional[str] = None,
|
|
905
911
|
pool: Optional[str] = None):
|
|
906
912
|
"""Background task that runs the job loop."""
|
|
907
|
-
# Replace os.environ with ContextualEnviron to enable per-job
|
|
908
|
-
# environment isolation. This allows each job to have its own
|
|
909
|
-
# environment variables without affecting other jobs or the main
|
|
910
|
-
# process.
|
|
911
|
-
context.initialize()
|
|
912
913
|
ctx = context.get()
|
|
913
|
-
ctx
|
|
914
|
+
assert ctx is not None, 'Context is not initialized'
|
|
915
|
+
ctx.redirect_log(pathlib.Path(log_file))
|
|
914
916
|
|
|
915
917
|
# Load and apply environment variables from the job's environment file
|
|
916
918
|
if env_file_path and os.path.exists(env_file_path):
|
|
@@ -921,13 +923,15 @@ class Controller:
|
|
|
921
923
|
f'{list(env_vars.keys())}')
|
|
922
924
|
|
|
923
925
|
# Apply environment variables to the job's context
|
|
924
|
-
ctx = context.get()
|
|
925
926
|
if ctx is not None:
|
|
926
927
|
for key, value in env_vars.items():
|
|
927
928
|
if value is not None:
|
|
928
929
|
ctx.override_envs({key: value})
|
|
929
930
|
job_logger.debug(
|
|
930
931
|
f'Set environment variable: {key}={value}')
|
|
932
|
+
# Reload the skypilot config for this context to make sure
|
|
933
|
+
# the latest config is used.
|
|
934
|
+
skypilot_config.reload_config()
|
|
931
935
|
else:
|
|
932
936
|
job_logger.error(
|
|
933
937
|
'Context is None, cannot set environment variables')
|
sky/jobs/server/core.py
CHANGED
|
@@ -281,8 +281,7 @@ def launch(
|
|
|
281
281
|
# Check whether cached jobs controller cluster is accessible
|
|
282
282
|
cluster_name = (
|
|
283
283
|
controller_utils.Controllers.JOBS_CONTROLLER.value.cluster_name)
|
|
284
|
-
|
|
285
|
-
if record is not None:
|
|
284
|
+
if global_user_state.cluster_with_name_exists(cluster_name):
|
|
286
285
|
# there is a cached jobs controller cluster
|
|
287
286
|
try:
|
|
288
287
|
# TODO: do something with returned status?
|
|
@@ -369,6 +368,8 @@ def launch(
|
|
|
369
368
|
'priority': priority,
|
|
370
369
|
'consolidation_mode_job_id': consolidation_mode_job_id,
|
|
371
370
|
'pool': pool,
|
|
371
|
+
'job_controller_indicator_file':
|
|
372
|
+
managed_job_constants.JOB_CONTROLLER_INDICATOR_FILE,
|
|
372
373
|
**controller_utils.shared_controller_vars_to_fill(
|
|
373
374
|
controller,
|
|
374
375
|
remote_user_config_path=remote_user_config_path,
|
|
@@ -959,9 +960,10 @@ def pool_apply(
|
|
|
959
960
|
task: 'sky.Task',
|
|
960
961
|
pool_name: str,
|
|
961
962
|
mode: serve_utils.UpdateMode = serve_utils.DEFAULT_UPDATE_MODE,
|
|
963
|
+
workers: Optional[int] = None,
|
|
962
964
|
) -> None:
|
|
963
965
|
"""Apply a config to a pool."""
|
|
964
|
-
return impl.apply(task, pool_name, mode, pool=True)
|
|
966
|
+
return impl.apply(task, workers, pool_name, mode, pool=True)
|
|
965
967
|
|
|
966
968
|
|
|
967
969
|
@usage_lib.entrypoint
|
sky/jobs/server/server.py
CHANGED
|
@@ -94,23 +94,27 @@ async def logs(
|
|
|
94
94
|
request: fastapi.Request, jobs_logs_body: payloads.JobsLogsBody,
|
|
95
95
|
background_tasks: fastapi.BackgroundTasks
|
|
96
96
|
) -> fastapi.responses.StreamingResponse:
|
|
97
|
-
|
|
97
|
+
schedule_type = api_requests.ScheduleType.SHORT
|
|
98
|
+
if jobs_logs_body.refresh:
|
|
99
|
+
# When refresh is specified, the job controller might be restarted,
|
|
100
|
+
# which takes longer time to finish. We schedule it to long executor.
|
|
101
|
+
schedule_type = api_requests.ScheduleType.LONG
|
|
102
|
+
request_task = executor.prepare_request(
|
|
98
103
|
request_id=request.state.request_id,
|
|
99
104
|
request_name='jobs.logs',
|
|
100
105
|
request_body=jobs_logs_body,
|
|
101
106
|
func=core.tail_logs,
|
|
102
|
-
|
|
103
|
-
# should be responsive. However, it can be long running if the user's
|
|
104
|
-
# job keeps running, and we should avoid it taking the SHORT worker
|
|
105
|
-
# indefinitely.
|
|
106
|
-
# When refresh is True we schedule it as LONG because a controller
|
|
107
|
-
# restart might be needed.
|
|
108
|
-
schedule_type=api_requests.ScheduleType.LONG
|
|
109
|
-
if jobs_logs_body.refresh else api_requests.ScheduleType.SHORT,
|
|
107
|
+
schedule_type=schedule_type,
|
|
110
108
|
request_cluster_name=common.JOB_CONTROLLER_NAME,
|
|
111
109
|
)
|
|
112
|
-
|
|
113
|
-
|
|
110
|
+
if schedule_type == api_requests.ScheduleType.LONG:
|
|
111
|
+
executor.schedule_prepared_request(request_task)
|
|
112
|
+
else:
|
|
113
|
+
# For short request, run in the coroutine to avoid blocking
|
|
114
|
+
# short workers.
|
|
115
|
+
task = executor.execute_request_in_coroutine(request_task)
|
|
116
|
+
# Cancel the coroutine after the request is done or client disconnects
|
|
117
|
+
background_tasks.add_task(task.cancel)
|
|
114
118
|
|
|
115
119
|
return stream_utils.stream_response(
|
|
116
120
|
request_id=request_task.request_id,
|
sky/jobs/utils.py
CHANGED
|
@@ -156,7 +156,7 @@ def _validate_consolidation_mode_config(
|
|
|
156
156
|
if current_is_consolidation_mode:
|
|
157
157
|
controller_cn = (
|
|
158
158
|
controller_utils.Controllers.JOBS_CONTROLLER.value.cluster_name)
|
|
159
|
-
if global_user_state.
|
|
159
|
+
if global_user_state.cluster_with_name_exists(controller_cn):
|
|
160
160
|
with ux_utils.print_exception_no_traceback():
|
|
161
161
|
raise exceptions.InconsistentConsolidationModeError(
|
|
162
162
|
f'{colorama.Fore.RED}Consolidation mode for jobs is '
|
sky/logs/agent.py
CHANGED
|
@@ -34,7 +34,8 @@ class FluentbitAgent(LoggingAgent):
|
|
|
34
34
|
def get_setup_command(self,
|
|
35
35
|
cluster_name: resources_utils.ClusterName) -> str:
|
|
36
36
|
install_cmd = (
|
|
37
|
-
|
|
37
|
+
# pylint: disable=line-too-long
|
|
38
|
+
'if ! command -v fluent-bit >/dev/null 2>&1 && [ ! -f /opt/fluent-bit/bin/fluent-bit ]; then '
|
|
38
39
|
'sudo apt-get update; sudo apt-get install -y gnupg; '
|
|
39
40
|
# pylint: disable=line-too-long
|
|
40
41
|
'sudo sh -c \'curl https://packages.fluentbit.io/fluentbit.key | gpg --dearmor > /usr/share/keyrings/fluentbit-keyring.gpg\'; '
|
|
@@ -51,14 +52,32 @@ class FluentbitAgent(LoggingAgent):
|
|
|
51
52
|
cfg_path = os.path.join(constants.LOGGING_CONFIG_DIR, 'fluentbit.yaml')
|
|
52
53
|
config_cmd = (f'mkdir -p {constants.LOGGING_CONFIG_DIR} && '
|
|
53
54
|
f'echo {shlex.quote(cfg)} > {cfg_path}')
|
|
55
|
+
kill_prior_cmd = (
|
|
56
|
+
'if [ -f "/tmp/fluentbit.pid" ]; then '
|
|
57
|
+
# pylint: disable=line-too-long
|
|
58
|
+
'echo "Killing prior fluent-bit process $(cat /tmp/fluentbit.pid)"; '
|
|
59
|
+
'kill "$(cat /tmp/fluentbit.pid)" || true; '
|
|
60
|
+
'fi')
|
|
54
61
|
start_cmd = ('nohup $(command -v fluent-bit || '
|
|
55
62
|
'echo "/opt/fluent-bit/bin/fluent-bit") '
|
|
56
|
-
f'-c {cfg_path} > /tmp/fluentbit.log 2>&1 &'
|
|
57
|
-
|
|
63
|
+
f'-c {cfg_path} > /tmp/fluentbit.log 2>&1 & '
|
|
64
|
+
'echo $! > /tmp/fluentbit.pid')
|
|
65
|
+
return ('set -e; '
|
|
66
|
+
f'{install_cmd}; '
|
|
67
|
+
f'{config_cmd}; '
|
|
68
|
+
f'{kill_prior_cmd}; '
|
|
69
|
+
f'{start_cmd}')
|
|
58
70
|
|
|
59
71
|
def fluentbit_config(self,
|
|
60
72
|
cluster_name: resources_utils.ClusterName) -> str:
|
|
61
73
|
cfg_dict = {
|
|
74
|
+
'parsers': [{
|
|
75
|
+
'name': 'sky-ray-parser',
|
|
76
|
+
'format': 'regex',
|
|
77
|
+
# pylint: disable=line-too-long
|
|
78
|
+
'regex': r'(?:\x1b\[[\d;]+m)?\((?<worker_name>[^,]+)(?:,\s*rank=(?<rank>\d+))?(?:,\s*pid=(?<pid>\d+))(?:,\s*ip=(?<ip>[\d.]+))?\)(?:\x1b\[[\d;]+m)?\s*(?<log_line>.*)',
|
|
79
|
+
'types': 'rank:integer pid:integer',
|
|
80
|
+
}],
|
|
62
81
|
'pipeline': {
|
|
63
82
|
'inputs': [{
|
|
64
83
|
'name': 'tail',
|
|
@@ -70,6 +89,14 @@ class FluentbitAgent(LoggingAgent):
|
|
|
70
89
|
# right after the job completion.
|
|
71
90
|
'refresh_interval': 1,
|
|
72
91
|
}],
|
|
92
|
+
'filters': [{
|
|
93
|
+
'name': 'parser',
|
|
94
|
+
'match': '*',
|
|
95
|
+
'key_name': 'log',
|
|
96
|
+
'parser': 'sky-ray-parser',
|
|
97
|
+
'preserve_key': 'on', # preserve field for backwards compat
|
|
98
|
+
'reserve_data': 'on',
|
|
99
|
+
}],
|
|
73
100
|
'outputs': [self.fluentbit_output_config(cluster_name)],
|
|
74
101
|
}
|
|
75
102
|
}
|
sky/logs/aws.py
CHANGED
|
@@ -5,7 +5,6 @@ from typing import Any, Dict, Optional
|
|
|
5
5
|
import pydantic
|
|
6
6
|
|
|
7
7
|
from sky.logs.agent import FluentbitAgent
|
|
8
|
-
from sky.skylet import constants
|
|
9
8
|
from sky.utils import resources_utils
|
|
10
9
|
from sky.utils import yaml_utils
|
|
11
10
|
|
|
@@ -176,6 +175,8 @@ class CloudwatchLoggingAgent(FluentbitAgent):
|
|
|
176
175
|
Returns:
|
|
177
176
|
The Fluent Bit configuration as a YAML string.
|
|
178
177
|
"""
|
|
178
|
+
cfg_dict = yaml_utils.read_yaml_str(
|
|
179
|
+
super().fluentbit_config(cluster_name))
|
|
179
180
|
display_name = cluster_name.display_name
|
|
180
181
|
unique_name = cluster_name.name_on_cloud
|
|
181
182
|
# Build tags for the log stream
|
|
@@ -197,24 +198,13 @@ class CloudwatchLoggingAgent(FluentbitAgent):
|
|
|
197
198
|
'value': value
|
|
198
199
|
})
|
|
199
200
|
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
# job creates a new log file and we must be responsive
|
|
208
|
-
# for this: the VM might be autodown within a minute
|
|
209
|
-
# right after the job completion.
|
|
210
|
-
'refresh_interval': 1,
|
|
211
|
-
'processors': {
|
|
212
|
-
'logs': log_processors,
|
|
213
|
-
}
|
|
214
|
-
}],
|
|
215
|
-
'outputs': [self.fluentbit_output_config(cluster_name)],
|
|
216
|
-
}
|
|
217
|
-
}
|
|
201
|
+
# Add log processors to config
|
|
202
|
+
processors_config = cfg_dict['pipeline']['inputs'][0].get(
|
|
203
|
+
'processors', {})
|
|
204
|
+
processors_logs_config = processors_config.get('logs', [])
|
|
205
|
+
processors_logs_config.extend(log_processors)
|
|
206
|
+
processors_config['logs'] = processors_logs_config
|
|
207
|
+
cfg_dict['pipeline']['inputs'][0]['processors'] = processors_config
|
|
218
208
|
|
|
219
209
|
return yaml_utils.dump_yaml_str(cfg_dict)
|
|
220
210
|
|
sky/provision/__init__.py
CHANGED
|
@@ -168,7 +168,8 @@ def map_all_volumes_usedby(
|
|
|
168
168
|
|
|
169
169
|
|
|
170
170
|
@_route_to_cloud_impl
|
|
171
|
-
def run_instances(provider_name: str, region: str,
|
|
171
|
+
def run_instances(provider_name: str, region: str, cluster_name: str,
|
|
172
|
+
cluster_name_on_cloud: str,
|
|
172
173
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
173
174
|
"""Start instances with bootstrapped configuration."""
|
|
174
175
|
raise NotImplementedError
|
sky/provision/aws/instance.py
CHANGED
|
@@ -311,9 +311,10 @@ def _get_head_instance_id(instances: List) -> Optional[str]:
|
|
|
311
311
|
return head_instance_id
|
|
312
312
|
|
|
313
313
|
|
|
314
|
-
def run_instances(region: str, cluster_name_on_cloud: str,
|
|
314
|
+
def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
315
315
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
316
316
|
"""See sky/provision/__init__.py"""
|
|
317
|
+
del cluster_name # unused
|
|
317
318
|
ec2 = _default_ec2_resource(region)
|
|
318
319
|
# NOTE: We set max_attempts=0 for fast failing when the resource is not
|
|
319
320
|
# available (although the doc says it will only retry for network
|
sky/provision/azure/instance.py
CHANGED
|
@@ -362,9 +362,10 @@ def _create_instances(compute_client: 'azure_compute.ComputeManagementClient',
|
|
|
362
362
|
return instances
|
|
363
363
|
|
|
364
364
|
|
|
365
|
-
def run_instances(region: str, cluster_name_on_cloud: str,
|
|
365
|
+
def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
366
366
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
367
367
|
"""See sky/provision/__init__.py"""
|
|
368
|
+
del cluster_name # unused
|
|
368
369
|
# TODO(zhwu): This function is too long. We should refactor it.
|
|
369
370
|
provider_config = config.provider_config
|
|
370
371
|
resource_group = provider_config['resource_group']
|
sky/provision/cudo/instance.py
CHANGED
|
@@ -40,10 +40,10 @@ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
|
|
|
40
40
|
return head_instance_id
|
|
41
41
|
|
|
42
42
|
|
|
43
|
-
def run_instances(region: str, cluster_name_on_cloud: str,
|
|
43
|
+
def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
44
44
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
45
45
|
"""Runs instances for the given cluster."""
|
|
46
|
-
|
|
46
|
+
del cluster_name # unused
|
|
47
47
|
pending_status = ['pend', 'init', 'prol', 'boot']
|
|
48
48
|
|
|
49
49
|
while True:
|
sky/provision/do/instance.py
CHANGED
|
@@ -26,10 +26,10 @@ def _get_head_instance(
|
|
|
26
26
|
return None
|
|
27
27
|
|
|
28
28
|
|
|
29
|
-
def run_instances(region: str, cluster_name_on_cloud: str,
|
|
29
|
+
def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
30
30
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
31
31
|
"""Runs instances for the given cluster."""
|
|
32
|
-
|
|
32
|
+
del cluster_name # unused
|
|
33
33
|
pending_status = ['new']
|
|
34
34
|
newly_started_instances = utils.filter_instances(cluster_name_on_cloud,
|
|
35
35
|
pending_status + ['off'])
|
sky/provision/docker_utils.py
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
import dataclasses
|
|
4
4
|
import shlex
|
|
5
5
|
import time
|
|
6
|
-
from typing import Any, Dict, List
|
|
6
|
+
from typing import Any, Dict, List, Optional
|
|
7
7
|
|
|
8
8
|
from sky import sky_logging
|
|
9
9
|
from sky.skylet import constants
|
|
@@ -192,12 +192,16 @@ class DockerInitializer:
|
|
|
192
192
|
self.docker_cmd = 'podman' if use_podman else 'docker'
|
|
193
193
|
self.log_path = log_path
|
|
194
194
|
|
|
195
|
-
def _run(
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
195
|
+
def _run(
|
|
196
|
+
self,
|
|
197
|
+
cmd,
|
|
198
|
+
run_env='host',
|
|
199
|
+
wait_for_docker_daemon: bool = False,
|
|
200
|
+
separate_stderr: bool = False,
|
|
201
|
+
log_err_when_fail: bool = True,
|
|
202
|
+
flock_name: Optional[str] = None,
|
|
203
|
+
flock_args: Optional[str] = None,
|
|
204
|
+
) -> str:
|
|
201
205
|
|
|
202
206
|
if run_env == 'docker':
|
|
203
207
|
cmd = self._docker_expand_user(cmd, any_char=True)
|
|
@@ -206,8 +210,13 @@ class DockerInitializer:
|
|
|
206
210
|
# an error: `the input device is not a TTY`, and it works without
|
|
207
211
|
# `-it` flag.
|
|
208
212
|
# TODO(zhwu): ray use the `-it` flag, we need to check why.
|
|
209
|
-
cmd = (f'{self.docker_cmd} exec {self.container_name}
|
|
210
|
-
f' {shlex.quote(cmd)} ')
|
|
213
|
+
cmd = (f'{self.docker_cmd} exec -u 0 {self.container_name}'
|
|
214
|
+
f' /bin/bash -c {shlex.quote(cmd)} ')
|
|
215
|
+
|
|
216
|
+
if flock_name is not None:
|
|
217
|
+
flock_args = flock_args or ''
|
|
218
|
+
cmd = (f'flock {flock_args} /tmp/{flock_name} '
|
|
219
|
+
f'-c {shlex.quote(cmd)}')
|
|
211
220
|
|
|
212
221
|
logger.debug(f'+ {cmd}')
|
|
213
222
|
start = time.time()
|
|
@@ -259,7 +268,10 @@ class DockerInitializer:
|
|
|
259
268
|
if self._check_container_exited():
|
|
260
269
|
self.initialized = True
|
|
261
270
|
self._run(f'{self.docker_cmd} start {self.container_name}')
|
|
262
|
-
self._run('sudo service ssh start',
|
|
271
|
+
self._run('sudo service ssh start',
|
|
272
|
+
run_env='docker',
|
|
273
|
+
flock_name=f'{self.container_name}.sky.lifecycle.lock',
|
|
274
|
+
flock_args='-s -w 1')
|
|
263
275
|
return self._run('whoami', run_env='docker')
|
|
264
276
|
|
|
265
277
|
# SkyPilot: Docker login if user specified a private docker registry.
|
|
@@ -358,7 +370,9 @@ class DockerInitializer:
|
|
|
358
370
|
self._auto_configure_shm(user_docker_run_options)),
|
|
359
371
|
self.docker_cmd,
|
|
360
372
|
)
|
|
361
|
-
self._run(f'{remove_container_cmd}
|
|
373
|
+
self._run(f'{remove_container_cmd} && {start_command}',
|
|
374
|
+
flock_name=f'{self.container_name}.sky.lifecycle.lock',
|
|
375
|
+
flock_args='-x -w 10')
|
|
362
376
|
|
|
363
377
|
# SkyPilot: Setup Commands.
|
|
364
378
|
# TODO(zhwu): the following setups should be aligned with the kubernetes
|
|
@@ -376,14 +390,18 @@ class DockerInitializer:
|
|
|
376
390
|
'echo "export DEBIAN_FRONTEND=noninteractive" >> ~/.bashrc;',
|
|
377
391
|
run_env='docker')
|
|
378
392
|
# Install dependencies.
|
|
379
|
-
|
|
380
|
-
'
|
|
393
|
+
cmd = (
|
|
394
|
+
'bash -lc \''
|
|
395
|
+
'exec 200>/var/tmp/sky_apt.lock; '
|
|
396
|
+
'flock -x -w 120 200 || exit 1; '
|
|
397
|
+
'export DEBIAN_FRONTEND=noninteractive; '
|
|
398
|
+
'apt-get -yq update && '
|
|
381
399
|
# Our mount script will install gcsfuse without fuse package.
|
|
382
400
|
# We need to install fuse package first to enable storage mount.
|
|
383
401
|
# The dpkg option is to suppress the prompt for fuse installation.
|
|
384
|
-
'
|
|
385
|
-
'rsync curl wget patch openssh-server python3-pip fuse
|
|
386
|
-
|
|
402
|
+
'apt-get -o DPkg::Options::=--force-confnew install -y '
|
|
403
|
+
'rsync curl wget patch openssh-server python3-pip fuse\'')
|
|
404
|
+
self._run(cmd, run_env='docker')
|
|
387
405
|
|
|
388
406
|
# Copy local authorized_keys to docker container.
|
|
389
407
|
# Stop and disable jupyter service. This is to avoid port conflict on
|
|
@@ -459,9 +477,13 @@ class DockerInitializer:
|
|
|
459
477
|
user_pos = string.find('~')
|
|
460
478
|
if user_pos > -1:
|
|
461
479
|
if self.home_dir is None:
|
|
462
|
-
cmd = (f'{self.docker_cmd} exec {self.container_name}
|
|
463
|
-
'printenv HOME')
|
|
464
|
-
self.home_dir = self._run(
|
|
480
|
+
cmd = (f'{self.docker_cmd} exec {self.container_name}'
|
|
481
|
+
' printenv HOME')
|
|
482
|
+
self.home_dir = self._run(
|
|
483
|
+
cmd,
|
|
484
|
+
separate_stderr=True,
|
|
485
|
+
flock_name=f'{self.container_name}.sky.lifecycle.lock',
|
|
486
|
+
flock_args='-s -w 1')
|
|
465
487
|
# Check for unexpected newline in home directory, which can be
|
|
466
488
|
# a common issue when the output is mixed with stderr.
|
|
467
489
|
assert '\n' not in self.home_dir, (
|
|
@@ -78,10 +78,10 @@ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
|
|
|
78
78
|
return head_instance_id
|
|
79
79
|
|
|
80
80
|
|
|
81
|
-
def run_instances(region: str, cluster_name_on_cloud: str,
|
|
81
|
+
def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
82
82
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
83
83
|
"""Runs instances for the given cluster."""
|
|
84
|
-
|
|
84
|
+
del cluster_name # unused
|
|
85
85
|
pending_status = ['pending', 'provisioning']
|
|
86
86
|
while True:
|
|
87
87
|
instances = _filter_instances(cluster_name_on_cloud, pending_status)
|
sky/provision/gcp/instance.py
CHANGED
|
@@ -360,9 +360,10 @@ def _run_instances(region: str, cluster_name_on_cloud: str,
|
|
|
360
360
|
created_instance_ids=created_instance_ids)
|
|
361
361
|
|
|
362
362
|
|
|
363
|
-
def run_instances(region: str, cluster_name_on_cloud: str,
|
|
363
|
+
def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
364
364
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
365
365
|
"""See sky/provision/__init__.py"""
|
|
366
|
+
del cluster_name # unused
|
|
366
367
|
try:
|
|
367
368
|
return _run_instances(region, cluster_name_on_cloud, config)
|
|
368
369
|
except gcp.http_error_exception() as e:
|
|
@@ -64,8 +64,9 @@ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
|
|
|
64
64
|
return next(iter(instances.keys()))
|
|
65
65
|
|
|
66
66
|
|
|
67
|
-
def run_instances(region: str, cluster_name_on_cloud: str,
|
|
67
|
+
def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
68
68
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
69
|
+
del cluster_name # unused
|
|
69
70
|
logger.info(f'Starting run_instances with region={region}, '
|
|
70
71
|
f'cluster={cluster_name_on_cloud}')
|
|
71
72
|
logger.debug(f'Config: {config}')
|
sky/provision/instance_setup.py
CHANGED
|
@@ -84,7 +84,7 @@ def _set_usage_run_id_cmd() -> str:
|
|
|
84
84
|
latest one when the function is called.
|
|
85
85
|
"""
|
|
86
86
|
return (
|
|
87
|
-
f'cat {usage_constants.USAGE_RUN_ID_FILE} || '
|
|
87
|
+
f'cat {usage_constants.USAGE_RUN_ID_FILE} 2> /dev/null || '
|
|
88
88
|
# The run id is retrieved locally for the current run, so that the
|
|
89
89
|
# remote cluster will be set with the same run id as the initial
|
|
90
90
|
# launch operation.
|