skypilot-nightly 1.0.0.dev20251029__py3-none-any.whl → 1.0.0.dev20251101__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic.
- sky/__init__.py +2 -2
- sky/adaptors/aws.py +25 -7
- sky/client/cli/command.py +47 -23
- sky/clouds/aws.py +59 -11
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/2755.d6dc6d530fed0b61.js +26 -0
- sky/dashboard/out/_next/static/chunks/{webpack-485984ca04e021d0.js → webpack-e38d5319cd10a3a0.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/mounting_utils.py +32 -2
- sky/jobs/constants.py +2 -0
- sky/jobs/controller.py +62 -67
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/scheduler.py +15 -2
- sky/jobs/server/core.py +85 -13
- sky/jobs/server/server.py +12 -11
- sky/jobs/server/utils.py +28 -10
- sky/jobs/state.py +216 -40
- sky/jobs/utils.py +60 -22
- sky/metrics/utils.py +18 -0
- sky/schemas/api/responses.py +1 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
- sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
- sky/serve/server/server.py +8 -7
- sky/server/common.py +21 -15
- sky/server/constants.py +1 -1
- sky/server/daemons.py +23 -17
- sky/server/requests/executor.py +7 -3
- sky/server/requests/request_names.py +80 -0
- sky/server/server.py +103 -35
- sky/skylet/constants.py +6 -1
- sky/skylet/events.py +7 -0
- sky/skylet/services.py +18 -7
- sky/ssh_node_pools/server.py +5 -4
- sky/task.py +4 -42
- sky/templates/kubernetes-ray.yml.j2 +1 -1
- sky/templates/websocket_proxy.py +140 -12
- sky/users/permission.py +4 -1
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/resource_checker.py +4 -1
- sky/utils/schemas.py +23 -4
- sky/volumes/server/server.py +4 -3
- sky/workspaces/server.py +7 -6
- {skypilot_nightly-1.0.0.dev20251029.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/METADATA +36 -36
- {skypilot_nightly-1.0.0.dev20251029.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/RECORD +67 -62
- sky/dashboard/out/_next/static/chunks/2755.a239c652bf8684dd.js +0 -26
- /sky/dashboard/out/_next/static/{DabuSAKsc_y0wyJxpTIdQ → 8ixeA0NVQJN8HUdijid8b}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{DabuSAKsc_y0wyJxpTIdQ → 8ixeA0NVQJN8HUdijid8b}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20251029.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251029.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251029.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251029.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/top_level.txt +0 -0
sky/jobs/file_content_utils.py
ADDED
@@ -0,0 +1,80 @@
+"""Utilities for managing managed job file content.
+
+The helpers in this module fetch job file content (DAG YAML/env files) from the
+database-first storage added for managed jobs, transparently falling back to
+legacy on-disk paths when needed. Consumers should prefer the string-based
+helpers so controllers never have to rely on local disk state.
+"""
+
+import os
+from typing import Optional
+
+from sky import sky_logging
+from sky.jobs import state as managed_job_state
+
+logger = sky_logging.init_logger(__name__)
+
+
+def get_job_dag_content(job_id: int) -> Optional[str]:
+    """Get DAG YAML content for a job from database or disk.
+
+    Args:
+        job_id: The job ID
+
+    Returns:
+        DAG YAML content as string, or None if not found
+    """
+    file_info = managed_job_state.get_job_file_contents(job_id)
+
+    # Prefer content stored in the database
+    if file_info['dag_yaml_content'] is not None:
+        return file_info['dag_yaml_content']
+
+    # Fallback to disk path for backward compatibility
+    dag_yaml_path = file_info.get('dag_yaml_path')
+    if dag_yaml_path and os.path.exists(dag_yaml_path):
+        try:
+            with open(dag_yaml_path, 'r', encoding='utf-8') as f:
+                content = f.read()
+            logger.debug('Loaded DAG YAML from disk for job %s: %s', job_id,
+                         dag_yaml_path)
+            return content
+        except (FileNotFoundError, IOError, OSError) as e:
+            logger.warning(
+                f'Failed to read DAG YAML from disk {dag_yaml_path}: {e}')
+
+    logger.warning(f'DAG YAML content not found for job {job_id}')
+    return None
+
+
+def get_job_env_content(job_id: int) -> Optional[str]:
+    """Get environment file content for a job from database or disk.
+
+    Args:
+        job_id: The job ID
+
+    Returns:
+        Environment file content as string, or None if not found
+    """
+    file_info = managed_job_state.get_job_file_contents(job_id)
+
+    # Prefer content stored in the database
+    if file_info['env_file_content'] is not None:
+        return file_info['env_file_content']
+
+    # Fallback to disk path for backward compatibility
+    env_file_path = file_info.get('env_file_path')
+    if env_file_path and os.path.exists(env_file_path):
+        try:
+            with open(env_file_path, 'r', encoding='utf-8') as f:
+                content = f.read()
+            logger.debug('Loaded environment file from disk for job %s: %s',
+                         job_id, env_file_path)
+            return content
+        except (FileNotFoundError, IOError, OSError) as e:
+            logger.warning(
+                f'Failed to read environment file from disk {env_file_path}: '
+                f'{e}')
+
+    # Environment file is optional, so don't warn if not found
+    return None
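For context, a minimal sketch of how a caller might use these helpers; the job ID and the YAML parsing step are illustrative only and are not part of the diff:

# Hypothetical consumer of the helpers above (job ID 42 is illustrative).
import yaml

from sky.jobs import file_content_utils

dag_yaml = file_content_utils.get_job_dag_content(42)
if dag_yaml is not None:
    # The helper returns a string whether the content came from the database
    # or from the legacy on-disk path, so callers never touch the filesystem.
    tasks = list(yaml.safe_load_all(dag_yaml))

env_content = file_content_utils.get_job_env_content(42)
# The env file is optional: None means no env file was stored, not an error.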
sky/jobs/log_gc.py
ADDED
@@ -0,0 +1,201 @@
+"""Log garbage collection for managed jobs."""
+
+import asyncio
+from datetime import datetime
+import os
+import pathlib
+import shutil
+import time
+
+import anyio
+import filelock
+
+from sky import sky_logging
+from sky import skypilot_config
+from sky.jobs import constants as managed_job_constants
+from sky.jobs import state as managed_job_state
+from sky.jobs import utils as managed_job_utils
+from sky.utils import context
+from sky.utils import context_utils
+
+logger = sky_logging.init_logger(__name__)
+
+# Filelock for garbage collector leader election.
+_JOB_CONTROLLER_GC_LOCK_PATH = os.path.expanduser(
+    '~/.sky/locks/job_controller_gc.lock')
+
+_DEFAULT_TASK_LOGS_GC_RETENTION_HOURS = 24 * 7
+_DEFAULT_CONTROLLER_LOGS_GC_RETENTION_HOURS = 24 * 7
+
+_LEAST_FREQUENT_GC_INTERVAL_SECONDS = 3600
+_MOST_FREQUENT_GC_INTERVAL_SECONDS = 30
+
+
+def _next_gc_interval(retention_seconds: int) -> int:
+    """Get the next GC interval."""
+    # Run the GC at least per hour to ensure hourly accuracy and
+    # at most per 30 seconds (when retention_seconds is small) to
+    # avoid too frequent cleanup.
+    return max(min(retention_seconds, _LEAST_FREQUENT_GC_INTERVAL_SECONDS),
+               _MOST_FREQUENT_GC_INTERVAL_SECONDS)
+
+
+async def gc_controller_logs_for_job():
+    """Garbage collect job and controller logs."""
+    while True:
+        skypilot_config.reload_config()
+        controller_logs_retention = skypilot_config.get_nested(
+            ('jobs', 'controller', 'controller_logs_gc_retention_hours'),
+            _DEFAULT_CONTROLLER_LOGS_GC_RETENTION_HOURS) * 3600
+        # Negative value disables the GC
+        if controller_logs_retention >= 0:
+            logger.info(f'GC controller logs for job: retention '
+                        f'{controller_logs_retention} seconds')
+            try:
+                finished = False
+                while not finished:
+                    finished = await _clean_controller_logs_with_retention(
+                        controller_logs_retention)
+            except asyncio.CancelledError:
+                logger.info('Managed jobs logs GC task cancelled')
+                break
+            except Exception as e:  # pylint: disable=broad-except
+                logger.error(f'Error GC controller logs for job: {e}',
+                             exc_info=True)
+        else:
+            logger.info('Controller logs GC is disabled')
+
+        interval = _next_gc_interval(controller_logs_retention)
+        logger.info('Next controller logs GC is scheduled after '
+                    f'{interval} seconds')
+        await asyncio.sleep(interval)
+
+
+async def gc_task_logs_for_job():
+    """Garbage collect task logs for job."""
+    while True:
+        skypilot_config.reload_config()
+        task_logs_retention = skypilot_config.get_nested(
+            ('jobs', 'controller', 'task_logs_gc_retention_hours'),
+            _DEFAULT_TASK_LOGS_GC_RETENTION_HOURS) * 3600
+        # Negative value disables the GC
+        if task_logs_retention >= 0:
+            logger.info('GC task logs for job: '
+                        f'retention {task_logs_retention} seconds')
+            try:
+                finished = False
+                while not finished:
+                    finished = await _clean_task_logs_with_retention(
+                        task_logs_retention)
+            except asyncio.CancelledError:
+                logger.info('Task logs GC task cancelled')
+                break
+            except Exception as e:  # pylint: disable=broad-except
+                logger.error(f'Error GC task logs for job: {e}', exc_info=True)
+        else:
+            logger.info('Controller logs GC is disabled')
+
+        interval = _next_gc_interval(task_logs_retention)
+        logger.info(f'Next task logs GC is scheduled after {interval} seconds')
+        await asyncio.sleep(_next_gc_interval(task_logs_retention))
+
+
+async def _clean_controller_logs_with_retention(retention_seconds: int,
+                                                batch_size: int = 100):
+    """Clean controller logs with retention.
+
+    Returns:
+        Whether the GC of this round has finished, False means there might
+        still be more controller logs to clean.
+    """
+    assert batch_size > 0, 'Batch size must be positive'
+    jobs = await managed_job_state.get_controller_logs_to_clean_async(
+        retention_seconds, batch_size=batch_size)
+    job_ids_to_update = []
+    for job in jobs:
+        job_ids_to_update.append(job['job_id'])
+        log_file = managed_job_utils.controller_log_file_for_job(job['job_id'])
+        cleaned_at = time.time()
+        if await anyio.Path(log_file).exists():
+            ts_str = datetime.fromtimestamp(cleaned_at).strftime(
+                '%Y-%m-%d %H:%M:%S')
+            msg = f'Controller log has been cleaned at {ts_str}.'
+            # Sync down logs will reference to this file directly, so we
+            # keep the file and delete the content.
+            # TODO(aylei): refactor sync down logs if the inode usage
+            # becomes an issue.
+            async with await anyio.open_file(log_file, 'w',
+                                             encoding='utf-8') as f:
+                await f.write(msg + '\n')
+    # Batch the update, the timestamp will be not accurate but it's okay.
+    await managed_job_state.set_controller_logs_cleaned_async(
+        job_ids=job_ids_to_update, logs_cleaned_at=time.time())
+    complete = len(jobs) < batch_size
+    logger.info(f'Cleaned {len(jobs)} controller logs with retention '
+                f'{retention_seconds} seconds, complete: {complete}')
+    return complete
+
+
+async def _clean_task_logs_with_retention(retention_seconds: int,
+                                          batch_size: int = 100):
+    """Clean task logs with retention.
+
+    Returns:
+        Whether the GC of this round has finished, False means there might
+        still be more task logs to clean.
+    """
+    assert batch_size > 0, 'Batch size must be positive'
+    tasks = await managed_job_state.get_task_logs_to_clean_async(
+        retention_seconds, batch_size=batch_size)
+    tasks_to_update = []
+    for task in tasks:
+        local_log_file = anyio.Path(task['local_log_file'])
+        # We assume the log directory has the following layout:
+        # task-id/
+        #   - run.log
+        #   - tasks/
+        #     - run.log
+        # and also remove the tasks directory on cleanup.
+        task_log_dir = local_log_file.parent.joinpath('tasks')
+        await local_log_file.unlink(missing_ok=True)
+        await context_utils.to_thread(shutil.rmtree,
+                                      str(task_log_dir),
+                                      ignore_errors=True)
+        # We have at least once semantic guarantee for the cleanup here.
+        tasks_to_update.append((task['job_id'], task['task_id']))
+    await managed_job_state.set_task_logs_cleaned_async(
+        tasks=list(tasks_to_update), logs_cleaned_at=time.time())
+    complete = len(tasks) < batch_size
+    logger.info(f'Cleaned {len(tasks)} task logs with retention '
+                f'{retention_seconds} seconds, complete: {complete}')
+    return complete
+
+
+@context.contextual_async
+async def run_log_gc():
+    """Run the log garbage collector."""
+    log_dir = os.path.expanduser(managed_job_constants.JOBS_CONTROLLER_LOGS_DIR)
+    os.makedirs(log_dir, exist_ok=True)
+    log_path = os.path.join(log_dir, 'garbage_collector.log')
+    # Remove previous log file
+    await anyio.Path(log_path).unlink(missing_ok=True)
+    ctx = context.get()
+    assert ctx is not None, 'Context is not initialized'
+    ctx.redirect_log(pathlib.Path(log_path))
+    gc_controller_logs_for_job_task = asyncio.create_task(
+        gc_controller_logs_for_job())
+    gc_task_logs_for_job_task = asyncio.create_task(gc_task_logs_for_job())
+    await asyncio.gather(gc_controller_logs_for_job_task,
+                         gc_task_logs_for_job_task)
+
+
+def elect_for_log_gc():
+    """Use filelock to elect for the log garbage collector.
+
+    The log garbage collector runs in the controller process to avoid the
+    overhead of launching a new process and the lifecycle management; the
+    threads that are not elected as the log garbage collector just wait
+    on the filelock and bring trivial overhead.
+    """
+    with filelock.FileLock(_JOB_CONTROLLER_GC_LOCK_PATH):
+        asyncio.run(run_log_gc())
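The GC loops above wake up on a clamped schedule: _next_gc_interval bounds the sleep between 30 seconds and one hour regardless of the configured retention. A self-contained sketch of that clamping, with the constants from the diff inlined:

# Clamping behavior of _next_gc_interval (constants inlined from the diff above).
def _next_gc_interval(retention_seconds: int) -> int:
    return max(min(retention_seconds, 3600), 30)


assert _next_gc_interval(10) == 30               # Very short retention: poll at most every 30s.
assert _next_gc_interval(600) == 600             # Mid-range retention: poll once per retention window.
assert _next_gc_interval(7 * 24 * 3600) == 3600  # Default week-long retention: poll hourly.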
sky/jobs/scheduler.py
CHANGED
@@ -263,6 +263,7 @@ def maybe_start_controllers(from_scheduler: bool = False) -> None:
 
         if started > 0:
             logger.info(f'Started {started} controllers')
+
     except filelock.Timeout:
         # If we can't get the lock, just exit. The process holding the lock
         # should launch any pending jobs.
@@ -289,8 +290,20 @@ def submit_job(job_id: int, dag_yaml_path: str, original_user_yaml_path: str,
         maybe_start_controllers(from_scheduler=True)
         return
 
-
-
+    with open(dag_yaml_path, 'r', encoding='utf-8') as dag_file:
+        dag_yaml_content = dag_file.read()
+    with open(original_user_yaml_path, 'r',
+              encoding='utf-8') as original_user_yaml_file:
+        original_user_yaml_content = original_user_yaml_file.read()
+    with open(env_file_path, 'r', encoding='utf-8') as env_file:
+        env_file_content = env_file.read()
+    logger.debug(f'Storing job {job_id} file contents in database '
+                 f'(DAG bytes={len(dag_yaml_content)}, '
+                 f'original user yaml bytes={len(original_user_yaml_content)}, '
+                 f'env bytes={len(env_file_content)}).')
+    state.scheduler_set_waiting(job_id, dag_yaml_content,
+                                original_user_yaml_content, env_file_content,
+                                priority)
     if state.get_ha_recovery_script(job_id) is None:
         # the run command is just the command that called scheduler
         run = (f'source {env_file_path} && '
sky/jobs/server/core.py
CHANGED
@@ -1,4 +1,6 @@
 """SDK functions for managed jobs."""
+import concurrent.futures
+import copy
 import ipaddress
 import os
 import pathlib
@@ -60,6 +62,35 @@ else:
 
 logger = sky_logging.init_logger(__name__)
 
+_MANAGED_JOB_FIELDS_FOR_QUEUE_KUBERNETES = [
+    'job_id',
+    'task_id',
+    'workspace',
+    'job_name',
+    'task_name',
+    'resources',
+    'submitted_at',
+    'end_at',
+    'job_duration',
+    'recovery_count',
+    'status',
+    'pool',
+    'current_cluster_name',
+    'job_id_on_pool_cluster',
+    'start_at',
+    'infra',
+    'cloud',
+    'region',
+    'zone',
+    'cluster_resources',
+    'schedule_state',
+    'details',
+    'failure_reason',
+    'metadata',
+    'user_name',
+    'user_hash',
+]
+
 
 def _upload_files_to_controller(dag: 'sky.Dag') -> Dict[str, str]:
     """Upload files to the controller.
@@ -357,12 +388,15 @@ def launch(
         ) as original_user_yaml_path:
             original_user_yaml_path.write(user_dag_str_user_specified)
             original_user_yaml_path.flush()
-
+            # Copy tasks to avoid race conditions when multiple threads modify
+            # the same dag object concurrently. Each thread needs its own copy.
+            dag_copy = copy.deepcopy(dag)
+            for task_ in dag_copy.tasks:
                 if job_rank is not None:
                     task_.update_envs({'SKYPILOT_JOB_RANK': str(job_rank)})
                 task_.update_envs({'SKYPILOT_NUM_JOBS': str(num_jobs)})
 
-            dag_utils.dump_chain_dag_to_yaml(
+            dag_utils.dump_chain_dag_to_yaml(dag_copy, f.name)
 
             vars_to_fill = {
                 'remote_original_user_yaml_path':
@@ -395,7 +429,8 @@ def launch(
 
             yaml_path = os.path.join(
                 managed_job_constants.JOBS_CONTROLLER_YAML_PREFIX,
-                f'{name}-{dag_uuid}-{consolidation_mode_job_id}.yaml'
+                f'{name}-{dag_uuid}-{consolidation_mode_job_id}-{job_rank}.yaml'
+            )
             common_utils.fill_template(
                 managed_job_constants.JOBS_CONTROLLER_TEMPLATE,
                 vars_to_fill,
@@ -403,7 +438,7 @@ def launch(
             controller_task = task_lib.Task.from_yaml(yaml_path)
             controller_task.set_resources(controller_resources)
 
-            controller_task.managed_job_dag =
+            controller_task.managed_job_dag = dag_copy
             # pylint: disable=protected-access
             controller_task._metadata = metadata
 
@@ -472,15 +507,49 @@ def launch(
         assert len(consolidation_mode_job_ids) == 1
         return _submit_one(consolidation_mode_job_ids[0])
 
-    ids = []
-    all_handle = None
-
-
+    ids: List[int] = []
+    all_handle: Optional[backends.ResourceHandle] = None
+
+    if num_jobs == 1:
+        job_id = (consolidation_mode_job_ids[0]
                   if consolidation_mode_job_ids is not None else None)
-    jid, handle = _submit_one(job_id,
+        jid, handle = _submit_one(job_id, 0, num_jobs=num_jobs)
         assert jid is not None, (job_id, handle)
         ids.append(jid)
         all_handle = handle
+    else:
+        # Submit jobs in parallel using ThreadPoolExecutor
+        with concurrent.futures.ThreadPoolExecutor(
+                max_workers=min(num_jobs,
+                                os.cpu_count() or 1)) as executor:
+            # Submit jobs concurrently
+            future_to_rank = {}
+            for job_rank in range(num_jobs):
+                job_id = (consolidation_mode_job_ids[job_rank]
+                          if consolidation_mode_job_ids is not None else None)
+                future = executor.submit(_submit_one, job_id, job_rank,
+                                         num_jobs)
+                future_to_rank[future] = job_rank
+
+            # Collect results in order of job_rank to maintain consistent order.
+            results: List[Optional[Tuple[
+                int, Optional[backends.ResourceHandle]]]] = [None] * num_jobs
+            for future in concurrent.futures.as_completed(future_to_rank):
+                job_rank = future_to_rank[future]
+                try:
+                    jid, handle = future.result()
+                    assert jid is not None, (job_id, handle)
+                    results[job_rank] = (jid, handle)
+                    all_handle = handle  # Keep the last handle.
+                except Exception as e:
+                    logger.error(f'Error launching job {job_rank}: {e}')
+                    raise e
+
+        # Extract job IDs in order
+        for res in results:
+            if res is not None:
+                ids.append(res[0])
+
     return ids, all_handle
 
 
@@ -533,7 +602,8 @@ def queue_from_kubernetes_pod(
             'kubernetes', cluster_info)[0]
 
     code = managed_job_utils.ManagedJobCodeGen.get_job_table(
-        skip_finished=skip_finished
+        skip_finished=skip_finished,
+        fields=_MANAGED_JOB_FIELDS_FOR_QUEUE_KUBERNETES)
     returncode, job_table_payload, stderr = managed_jobs_runner.run(
         code,
         require_outputs=True,
@@ -646,8 +716,7 @@ def queue(refresh: bool,
         does not exist.
         RuntimeError: if failed to get the managed jobs with ssh.
     """
-    jobs, _, _, _ = queue_v2(refresh, skip_finished, all_users, job_ids
-                             None, None, None, None, None, None)
+    jobs, _, _, _ = queue_v2(refresh, skip_finished, all_users, job_ids)
 
    return jobs
 
@@ -764,7 +833,8 @@ def queue_v2(
     try:
         request = managed_jobsv1_pb2.GetJobTableRequest(
             skip_finished=skip_finished,
-            accessible_workspaces=
+            accessible_workspaces=(managed_jobsv1_pb2.Workspaces(
+                workspaces=accessible_workspaces)),
             job_ids=managed_jobsv1_pb2.JobIds(
                 ids=job_ids) if job_ids is not None else None,
             workspace_match=workspace_match,
@@ -780,6 +850,8 @@ def queue_v2(
             ]) if user_hashes is not None else None,
             statuses=managed_jobsv1_pb2.Statuses(
                 statuses=statuses) if statuses is not None else None,
+            fields=managed_jobsv1_pb2.Fields(
+                fields=fields) if fields is not None else None,
             show_jobs_without_user_hash=show_jobs_without_user_hash,
         )
         response = backend_utils.invoke_skylet_with_retries(
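The parallel submission added to launch() follows a standard pattern: one future per job rank, with each result written back into a rank-indexed list so the returned IDs keep a deterministic order even though futures complete out of order. A self-contained sketch of that pattern; the _submit_one stand-in below is illustrative and not SkyPilot's implementation:

import concurrent.futures
import os
from typing import List, Optional, Tuple


def _submit_one(rank: int, total: int) -> Tuple[int, str]:
    """Stand-in for the real submission call; returns a fake (job_id, handle)."""
    return 1000 + rank, f'handle-{rank}-of-{total}'


num_jobs = 4
results: List[Optional[Tuple[int, str]]] = [None] * num_jobs
with concurrent.futures.ThreadPoolExecutor(
        max_workers=min(num_jobs, os.cpu_count() or 1)) as executor:
    future_to_rank = {
        executor.submit(_submit_one, rank, num_jobs): rank
        for rank in range(num_jobs)
    }
    # Futures finish in arbitrary order; writing into a rank-indexed slot
    # restores a deterministic output order.
    for future in concurrent.futures.as_completed(future_to_rank):
        rank = future_to_rank[future]
        results[rank] = future.result()

ids = [res[0] for res in results if res is not None]
assert ids == [1000, 1001, 1002, 1003]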
sky/jobs/server/server.py
CHANGED
@@ -11,6 +11,7 @@ from sky.server import common as server_common
 from sky.server import stream_utils
 from sky.server.requests import executor
 from sky.server.requests import payloads
+from sky.server.requests import request_names
 from sky.server.requests import requests as api_requests
 from sky.skylet import constants
 from sky.utils import common
@@ -37,7 +38,7 @@ async def launch(request: fastapi.Request,
         if consolidation_mode else api_requests.ScheduleType.LONG)
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.JOBS_LAUNCH,
         request_body=jobs_launch_body,
         func=core.launch,
         schedule_type=schedule_type,
@@ -52,7 +53,7 @@ async def queue(request: fastapi.Request,
                 jobs_queue_body: payloads.JobsQueueBody) -> None:
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.JOBS_QUEUE,
         request_body=jobs_queue_body,
         func=core.queue,
         schedule_type=(api_requests.ScheduleType.LONG if jobs_queue_body.refresh
@@ -66,7 +67,7 @@ async def queue_v2(request: fastapi.Request,
                    jobs_queue_body_v2: payloads.JobsQueueV2Body) -> None:
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.JOBS_QUEUE_V2,
         request_body=jobs_queue_body_v2,
         func=core.queue_v2_api,
         schedule_type=(api_requests.ScheduleType.LONG
@@ -81,7 +82,7 @@ async def cancel(request: fastapi.Request,
                  jobs_cancel_body: payloads.JobsCancelBody) -> None:
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.JOBS_CANCEL,
         request_body=jobs_cancel_body,
         func=core.cancel,
         schedule_type=api_requests.ScheduleType.SHORT,
@@ -103,7 +104,7 @@ async def logs(
     executor.check_request_thread_executor_available()
     request_task = await executor.prepare_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.JOBS_LOGS,
         request_body=jobs_logs_body,
         func=core.tail_logs,
         schedule_type=schedule_type,
@@ -143,7 +144,7 @@ async def download_logs(
     jobs_download_logs_body.local_dir = str(logs_dir_on_api_server)
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.JOBS_DOWNLOAD_LOGS,
         request_body=jobs_download_logs_body,
         func=core.download_logs,
         schedule_type=api_requests.ScheduleType.LONG
@@ -157,7 +158,7 @@ async def pool_apply(request: fastapi.Request,
                     jobs_pool_apply_body: payloads.JobsPoolApplyBody) -> None:
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.JOBS_POOL_APPLY,
         request_body=jobs_pool_apply_body,
         func=core.pool_apply,
         schedule_type=api_requests.ScheduleType.LONG,
@@ -170,7 +171,7 @@ async def pool_down(request: fastapi.Request,
                    jobs_pool_down_body: payloads.JobsPoolDownBody) -> None:
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.JOBS_POOL_DOWN,
         request_body=jobs_pool_down_body,
         func=core.pool_down,
         schedule_type=api_requests.ScheduleType.SHORT,
@@ -184,7 +185,7 @@ async def pool_status(
         jobs_pool_status_body: payloads.JobsPoolStatusBody) -> None:
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.JOBS_POOL_STATUS,
         request_body=jobs_pool_status_body,
         func=core.pool_status,
         schedule_type=api_requests.ScheduleType.SHORT,
@@ -199,7 +200,7 @@ async def pool_tail_logs(
 ) -> fastapi.responses.StreamingResponse:
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.JOBS_POOL_LOGS,
         request_body=log_body,
         func=core.pool_tail_logs,
         schedule_type=api_requests.ScheduleType.SHORT,
@@ -235,7 +236,7 @@ async def pool_download_logs(
     download_logs_body.local_dir = str(logs_dir_on_api_server)
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.JOBS_POOL_SYNC_DOWN_LOGS,
         request_body=download_logs_body,
         func=core.pool_sync_down_logs,
         schedule_type=api_requests.ScheduleType.SHORT,
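The handlers above reference members of a new request_names.RequestName type; the module itself (sky/server/requests/request_names.py, +80 lines) is not expanded in this diff. A hypothetical sketch of what such a registry could look like, restricted to the members used above; the enum base class and the string values are assumptions, not taken from the package:

import enum


class RequestName(str, enum.Enum):
    """Hypothetical reconstruction; only members referenced in this diff are listed."""
    JOBS_LAUNCH = 'jobs.launch'
    JOBS_QUEUE = 'jobs.queue'
    JOBS_QUEUE_V2 = 'jobs.queue_v2'
    JOBS_CANCEL = 'jobs.cancel'
    JOBS_LOGS = 'jobs.logs'
    JOBS_DOWNLOAD_LOGS = 'jobs.download_logs'
    JOBS_POOL_APPLY = 'jobs.pool_apply'
    JOBS_POOL_DOWN = 'jobs.pool_down'
    JOBS_POOL_STATUS = 'jobs.pool_status'
    JOBS_POOL_LOGS = 'jobs.pool_logs'
    JOBS_POOL_SYNC_DOWN_LOGS = 'jobs.pool_sync_down_logs'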
sky/jobs/server/utils.py
CHANGED
@@ -19,6 +19,11 @@ else:
     managed_jobsv1_pb2 = adaptors_common.LazyImport(
         'sky.schemas.generated.managed_jobsv1_pb2')
 
+_MANAGED_JOB_FIELDS_TO_GET = [
+    'job_id', 'task_id', 'workspace', 'job_name', 'task_name', 'resources',
+    'submitted_at', 'end_at', 'job_duration', 'recovery_count', 'status', 'pool'
+]
+
 
 def check_version_mismatch_and_non_terminal_jobs() -> None:
     """Check if controller has version mismatch and non-terminal jobs exist.
@@ -50,7 +55,11 @@ def check_version_mismatch_and_non_terminal_jobs() -> None:
         )).get_managed_job_controller_version(version_request))
     controller_version = version_response.controller_version
 
-    job_table_request = managed_jobsv1_pb2.GetJobTableRequest(
+    job_table_request = managed_jobsv1_pb2.GetJobTableRequest(
+        skip_finished=True,
+        fields=managed_jobsv1_pb2.Fields(
+            fields=_MANAGED_JOB_FIELDS_TO_GET),
+    )
     job_table_response = backend_utils.invoke_skylet_with_retries(
         lambda: cloud_vm_ray_backend.SkyletClient(
             handle.get_grpc_channel()).get_managed_job_table(
@@ -62,7 +71,7 @@ def check_version_mismatch_and_non_terminal_jobs() -> None:
 
     if use_legacy:
         # Get controller version and raw job table
-        code = managed_job_utils.ManagedJobCodeGen.
+        code = managed_job_utils.ManagedJobCodeGen.get_version()
 
         returncode, output, stderr = backend.run_on_head(handle,
                                                          code,
@@ -72,7 +81,7 @@
 
         if returncode != 0:
             logger.error(output + stderr)
-            raise ValueError('Failed to check controller version
+            raise ValueError('Failed to check controller version with '
                              f'returncode: {returncode}.\n{output + stderr}')
 
         # Parse the output to extract controller version (split only on first
@@ -80,19 +89,28 @@
         output_parts = output.strip().split('\n', 1)
 
         # Extract controller version from first line
-        if
-        'controller_version:'):
+        if not output_parts[0].startswith('controller_version:'):
             raise ValueError(
                 f'Expected controller version in first line, got: {output}')
 
         controller_version = output_parts[0].split(':', 1)[1]
 
-
-
+        code = managed_job_utils.ManagedJobCodeGen.get_job_table(
+            skip_finished=True, fields=_MANAGED_JOB_FIELDS_TO_GET)
+        returncode, job_table_payload, stderr = backend.run_on_head(
+            handle,
+            code,
+            require_outputs=True,
+            stream_logs=False,
+            separate_stderr=True)
+
+        if returncode != 0:
+            logger.error(job_table_payload + stderr)
+            raise ValueError('Failed to fetch managed jobs with returncode: '
+                             f'{returncode}.\n{job_table_payload + stderr}')
 
-
-
-        job_table_payload)
+        jobs, _, _, _, _ = (
+            managed_job_utils.load_managed_job_queue(job_table_payload))
 
     # Process locally: check version match and filter non-terminal jobs
     version_matches = (controller_version == local_version or