skypilot-nightly 1.0.0.dev20250910__py3-none-any.whl → 1.0.0.dev20250913__py3-none-any.whl
This diff compares two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release: this version of skypilot-nightly might be problematic.
- sky/__init__.py +4 -2
- sky/adaptors/seeweb.py +103 -0
- sky/authentication.py +38 -0
- sky/backends/backend_utils.py +148 -30
- sky/backends/cloud_vm_ray_backend.py +606 -223
- sky/catalog/__init__.py +7 -0
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +18 -0
- sky/catalog/data_fetchers/fetch_aws.py +13 -37
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/seeweb_catalog.py +184 -0
- sky/client/cli/command.py +2 -71
- sky/client/sdk_async.py +5 -2
- sky/clouds/__init__.py +2 -0
- sky/clouds/aws.py +23 -5
- sky/clouds/cloud.py +8 -0
- sky/clouds/kubernetes.py +2 -0
- sky/clouds/seeweb.py +463 -0
- sky/core.py +46 -12
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{3SYxqNGnvvPS8h3gdD2T7 → Y0Q7LyrxiFoWWbTdwb5nh}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/1141-159df2d4c441a9d1.js +1 -0
- sky/dashboard/out/_next/static/chunks/3015-2ea98b57e318bd6e.js +1 -0
- sky/dashboard/out/_next/static/chunks/3294.03e02ae73455f48e.js +6 -0
- sky/dashboard/out/_next/static/chunks/3785.0fa442e16dd3f00e.js +1 -0
- sky/dashboard/out/_next/static/chunks/5339.c033b29835da0f35.js +51 -0
- sky/dashboard/out/_next/static/chunks/6856-e0754534b3015377.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-11c8e9b982e8ffec.js +1 -0
- sky/dashboard/out/_next/static/chunks/9037-f9800e64eb05dd1c.js +6 -0
- sky/dashboard/out/_next/static/chunks/{webpack-1d7e11230da3ca89.js → webpack-d1e29b3aa66bf4cf.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/exceptions.py +5 -0
- sky/global_user_state.py +75 -26
- sky/jobs/client/sdk_async.py +4 -2
- sky/jobs/controller.py +4 -2
- sky/jobs/recovery_strategy.py +1 -1
- sky/jobs/state.py +26 -16
- sky/jobs/utils.py +67 -24
- sky/logs/agent.py +10 -2
- sky/provision/__init__.py +1 -0
- sky/provision/kubernetes/config.py +7 -2
- sky/provision/kubernetes/instance.py +84 -41
- sky/provision/kubernetes/utils.py +14 -3
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +806 -0
- sky/provision/vast/instance.py +1 -1
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +252 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/server/config.py +14 -5
- sky/server/metrics.py +41 -8
- sky/server/requests/executor.py +41 -4
- sky/server/server.py +1 -0
- sky/server/uvicorn.py +11 -5
- sky/setup_files/dependencies.py +8 -1
- sky/skylet/constants.py +14 -8
- sky/skylet/job_lib.py +128 -10
- sky/skylet/log_lib.py +14 -3
- sky/skylet/log_lib.pyi +9 -0
- sky/skylet/services.py +203 -0
- sky/skylet/skylet.py +4 -0
- sky/task.py +62 -0
- sky/templates/kubernetes-ray.yml.j2 +120 -3
- sky/templates/seeweb-ray.yml.j2 +108 -0
- sky/utils/accelerator_registry.py +3 -1
- sky/utils/command_runner.py +35 -11
- sky/utils/command_runner.pyi +22 -0
- sky/utils/context_utils.py +15 -2
- sky/utils/controller_utils.py +11 -5
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/git.py +559 -1
- sky/utils/resource_checker.py +8 -7
- sky/workspaces/core.py +57 -21
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/METADATA +40 -35
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/RECORD +96 -85
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6856-6e2bc8a6fd0867af.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- /sky/dashboard/out/_next/static/{3SYxqNGnvvPS8h3gdD2T7 → Y0Q7LyrxiFoWWbTdwb5nh}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250913.dist-info}/top_level.txt +0 -0
sky/server/requests/executor.py
CHANGED
@@ -31,6 +31,7 @@ import time
 import typing
 from typing import Any, Callable, Generator, List, Optional, TextIO, Tuple
 
+import psutil
 import setproctitle
 
 from sky import exceptions
@@ -130,8 +131,9 @@ queue_backend = server_config.QueueBackend.MULTIPROCESSING
 def executor_initializer(proc_group: str):
     setproctitle.setproctitle(f'SkyPilot:executor:{proc_group}:'
                               f'{multiprocessing.current_process().pid}')
+    # Executor never stops, unless the whole process is killed.
     threading.Thread(target=metrics_lib.process_monitor,
-                     args=(f'worker:{proc_group}',),
+                     args=(f'worker:{proc_group}', threading.Event()),
                      daemon=True).start()
 
 
@@ -373,11 +375,13 @@ def _request_execution_wrapper(request_id: str,
     4. Handle the SIGTERM signal to abort the request gracefully.
     5. Maintain the lifecycle of the temp dir used by the request.
     """
+    pid = multiprocessing.current_process().pid
+    proc = psutil.Process(pid)
+    rss_begin = proc.memory_info().rss
     db_utils.set_max_connections(num_db_connections_per_worker)
     # Handle the SIGTERM signal to abort the request processing gracefully.
     signal.signal(signal.SIGTERM, _sigterm_handler)
 
-    pid = multiprocessing.current_process().pid
     logger.info(f'Running request {request_id} with pid {pid}')
     with api_requests.update_request(request_id) as request_task:
         assert request_task is not None, request_id
@@ -443,8 +447,41 @@ def _request_execution_wrapper(request_id: str,
         _restore_output(original_stdout, original_stderr)
         logger.info(f'Request {request_id} finished')
     finally:
-
-
+        try:
+            # Capture the peak RSS before GC.
+            peak_rss = max(proc.memory_info().rss,
+                           metrics_lib.peak_rss_bytes)
+            with metrics_lib.time_it(name='release_memory',
+                                     group='internal'):
+                common_utils.release_memory()
+            _record_memory_metrics(request_name, proc, rss_begin, peak_rss)
+        except Exception as e:  # pylint: disable=broad-except
+            logger.error(f'Failed to record memory metrics: '
+                         f'{common_utils.format_exception(e)}')
+
+
+_first_request = True
+
+
+def _record_memory_metrics(request_name: str, proc: psutil.Process,
+                           rss_begin: int, peak_rss: int) -> None:
+    """Record the memory metrics for a request."""
+    # Do not record full memory delta for the first request as it
+    # will loads the sky core modules and make the memory usage
+    # estimation inaccurate.
+    global _first_request
+    if _first_request:
+        _first_request = False
+        return
+    rss_end = proc.memory_info().rss
+
+    # Answer "how much RSS this request contributed?"
+    metrics_lib.SKY_APISERVER_REQUEST_RSS_INCR_BYTES.labels(
+        name=request_name).observe(max(rss_end - rss_begin, 0))
+    # Estimate the memory usage by the request by capturing the
+    # peak memory delta during the request execution.
+    metrics_lib.SKY_APISERVER_REQUEST_MEMORY_USAGE_BYTES.labels(
+        name=request_name).observe(max(peak_rss - rss_begin, 0))
 
 
 async def execute_request_coroutine(request: api_requests.Request):
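Reviewer note: the new `_record_memory_metrics` hook follows a common psutil-plus-Prometheus pattern: sample RSS at request start, capture a peak, release memory, then observe both the incremental and peak deltas. Below is a minimal standalone sketch of that pattern; the metric names, the `gc.collect()` stand-in for `release_memory()`, and the helper itself are illustrative assumptions, not SkyPilot's actual `metrics_lib`.

import gc

import psutil
from prometheus_client import Histogram

# Stand-in metrics; the real ones live in sky/server/metrics.py.
REQUEST_RSS_INCR = Histogram('request_rss_incr_bytes',
                             'RSS growth per request', ['name'])
REQUEST_MEM_USAGE = Histogram('request_memory_usage_bytes',
                              'Peak RSS delta per request', ['name'])


def run_with_memory_metrics(name, fn, *args, **kwargs):
    proc = psutil.Process()                # current worker process
    rss_begin = proc.memory_info().rss
    try:
        return fn(*args, **kwargs)
    finally:
        # The real code tracks a true peak via a monitor thread; sampling
        # once before garbage collection is a cheap approximation here.
        peak_rss = proc.memory_info().rss
        gc.collect()                       # stand-in for release_memory()
        rss_end = proc.memory_info().rss
        REQUEST_RSS_INCR.labels(name=name).observe(max(rss_end - rss_begin, 0))
        REQUEST_MEM_USAGE.labels(name=name).observe(max(peak_rss - rss_begin, 0))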
sky/server/server.py
CHANGED
@@ -1214,6 +1214,7 @@ async def logs(
         request_body=cluster_job_body,
         func=core.tail_logs,
         schedule_type=requests_lib.ScheduleType.SHORT,
+        request_cluster_name=cluster_job_body.cluster_name,
     )
     task = asyncio.create_task(executor.execute_request_coroutine(request_task))
 
sky/server/uvicorn.py
CHANGED
@@ -213,11 +213,17 @@ class Server(uvicorn.Server):
         # Same as set PYTHONASYNCIODEBUG=1, but with custom threshold.
         event_loop.set_debug(True)
         event_loop.slow_callback_duration = lag_threshold
-        threading.
-
-
-
-
+        stop_monitor = threading.Event()
+        monitor = threading.Thread(target=metrics_lib.process_monitor,
+                                   args=('server', stop_monitor),
+                                   daemon=True)
+        monitor.start()
+        try:
+            with self.capture_signals():
+                asyncio.run(self.serve(*args, **kwargs))
+        finally:
+            stop_monitor.set()
+            monitor.join()
 
 
 def run(config: uvicorn.Config, max_db_connections: Optional[int] = None):
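Reviewer note: this change swaps a fire-and-forget daemon thread for an explicit stop event so the metrics monitor can be joined on shutdown. A minimal sketch of that lifecycle, using only the standard library, with a stand-in monitor loop in place of `metrics_lib.process_monitor`:

import threading
import time


def process_monitor(name: str, stop: threading.Event) -> None:
    # Stand-in body: poll until the owner signals shutdown.
    while not stop.wait(timeout=1.0):
        pass  # collect metrics for `name` here


stop_monitor = threading.Event()
monitor = threading.Thread(target=process_monitor,
                           args=('server', stop_monitor),
                           daemon=True)
monitor.start()
try:
    time.sleep(0.1)  # stand-in for asyncio.run(self.serve(...))
finally:
    stop_monitor.set()  # ask the monitor to exit...
    monitor.join()      # ...and wait for it instead of abandoning it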
sky/setup_files/dependencies.py
CHANGED
@@ -110,7 +110,8 @@ server_dependencies = [
 local_ray = [
     # Lower version of ray will cause dependency conflict for
     # click/grpcio/protobuf.
-    # Ray 2.6.1+ resolved cluster launcher bugs
+    # Ray 2.6.1+ resolved cluster launcher bugs
+    # and grpcio issues on Apple Silicon.
     # https://github.com/ray-project/ray/releases/tag/ray-2.6.1
     'ray[default] >= 2.6.1',
 ]
@@ -200,9 +201,14 @@ extras_require: Dict[str, List[str]] = {
         # 'vsphere-automation-sdk @ git+https://github.com/vmware/vsphere-automation-sdk-python.git@v8.0.1.0' pylint: disable=line-too-long
     ],
     'nebius': [
+        # Nebius requires grpcio and protobuf, so we need to include
+        # our constraints here.
         'nebius>=0.2.47',
+        GRPC,
+        PROTOBUF,
     ] + aws_dependencies,
     'hyperbolic': [],  # No dependencies needed for hyperbolic
+    'seeweb': ['ecsapi>=0.2.0'],
     'server': server_dependencies,
 }
 
@@ -213,6 +219,7 @@ clouds_for_all.remove('remote')
 if sys.version_info < (3, 10):
     # Nebius needs python3.10. If python 3.9 [all] will not install nebius
     clouds_for_all.remove('nebius')
+    clouds_for_all.remove('seeweb')
 
 if sys.version_info >= (3, 12):
     # The version of ray we use does not work with >= 3.12, so avoid clouds
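Reviewer note: the extras change can be read as the simplified sketch below. The list contents are an illustrative subset, and the GRPC/PROTOBUF values are placeholders for the shared constraints defined elsewhere in dependencies.py; the only facts taken from the diff are that the new `seeweb` extra pulls in `ecsapi>=0.2.0` and that, like `nebius`, it is dropped from the `[all]` extra on Python 3.9.

import sys

GRPC = 'grpcio'        # placeholder, not the actual pinned constraint
PROTOBUF = 'protobuf'  # placeholder, not the actual pinned constraint

extras = {
    'nebius': ['nebius>=0.2.47', GRPC, PROTOBUF],
    'seeweb': ['ecsapi>=0.2.0'],
}

clouds_for_all = list(extras)
if sys.version_info < (3, 10):
    # Both nebius and, per this release, seeweb are excluded on Python 3.9.
    clouds_for_all.remove('nebius')
    clouds_for_all.remove('seeweb')

Once published, the new extra would typically be installed with `pip install "skypilot-nightly[seeweb]"`.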
sky/skylet/constants.py
CHANGED
@@ -62,11 +62,14 @@ SKY_UV_INSTALL_CMD = (f'{SKY_UV_CMD} -V >/dev/null 2>&1 || '
                       'curl -LsSf https://astral.sh/uv/install.sh '
                       f'| UV_INSTALL_DIR={SKY_UV_INSTALL_DIR} sh')
 SKY_UV_PIP_CMD: str = (f'VIRTUAL_ENV={SKY_REMOTE_PYTHON_ENV} {SKY_UV_CMD} pip')
-
-#
+SKY_UV_RUN_CMD: str = (f'VIRTUAL_ENV={SKY_REMOTE_PYTHON_ENV} {SKY_UV_CMD} run')
+# Deleting the SKY_REMOTE_PYTHON_ENV_NAME from the PATH and unsetting relevant
+# VIRTUAL_ENV envvars to deactivate the environment. `deactivate` command does
+# not work when conda is used.
 DEACTIVATE_SKY_REMOTE_PYTHON_ENV = (
     'export PATH='
-    f'$(echo $PATH | sed "s|$(echo ~)/{SKY_REMOTE_PYTHON_ENV_NAME}/bin:||")'
+    f'$(echo $PATH | sed "s|$(echo ~)/{SKY_REMOTE_PYTHON_ENV_NAME}/bin:||") && '
+    'unset VIRTUAL_ENV && unset VIRTUAL_ENV_PROMPT')
 
 # Prefix for SkyPilot environment variables
 SKYPILOT_ENV_VAR_PREFIX = 'SKYPILOT_'
@@ -98,7 +101,7 @@ SKYLET_VERSION = '18'
 SKYLET_LIB_VERSION = 4
 SKYLET_VERSION_FILE = '~/.sky/skylet_version'
 SKYLET_GRPC_PORT = 46590
-SKYLET_GRPC_TIMEOUT_SECONDS =
+SKYLET_GRPC_TIMEOUT_SECONDS = 10
 
 # Docker default options
 DEFAULT_DOCKER_CONTAINER_NAME = 'sky_container'
@@ -229,7 +232,7 @@ RAY_INSTALLATION_COMMANDS = (
     'export PATH=$PATH:$HOME/.local/bin; '
     # Writes ray path to file if it does not exist or the file is empty.
     f'[ -s {SKY_RAY_PATH_FILE} ] || '
-    f'{{ {
+    f'{{ {SKY_UV_RUN_CMD} '
     f'which ray > {SKY_RAY_PATH_FILE} || exit 1; }}; ')
 
 SKYPILOT_WHEEL_INSTALLATION_COMMANDS = (
@@ -374,7 +377,6 @@ OVERRIDEABLE_CONFIG_KEYS_IN_TASK: List[Tuple[str, ...]] = [
     ('ssh', 'pod_config'),
     ('kubernetes', 'custom_metadata'),
     ('kubernetes', 'pod_config'),
-    ('kubernetes', 'context_configs'),
     ('kubernetes', 'provision_timeout'),
     ('kubernetes', 'dws'),
     ('kubernetes', 'kueue'),
@@ -382,6 +384,7 @@ OVERRIDEABLE_CONFIG_KEYS_IN_TASK: List[Tuple[str, ...]] = [
     ('gcp', 'enable_gvnic'),
     ('gcp', 'enable_gpu_direct'),
     ('gcp', 'placement_policy'),
+    ('active_workspace',),
 ]
 # When overriding the SkyPilot configs on the API server with the client one,
 # we skip the following keys because they are meant to be client-side configs.
@@ -449,11 +452,11 @@ SKYPILOT_DEFAULT_WORKSPACE = 'default'
 # BEGIN constants used for service catalog.
 HOSTED_CATALOG_DIR_URL = 'https://raw.githubusercontent.com/skypilot-org/skypilot-catalog/master/catalogs'  # pylint: disable=line-too-long
 HOSTED_CATALOG_DIR_URL_S3_MIRROR = 'https://skypilot-catalog.s3.us-east-1.amazonaws.com/catalogs'  # pylint: disable=line-too-long
-CATALOG_SCHEMA_VERSION = '
+CATALOG_SCHEMA_VERSION = 'v8'
 CATALOG_DIR = '~/.sky/catalogs'
 ALL_CLOUDS = ('aws', 'azure', 'gcp', 'ibm', 'lambda', 'scp', 'oci',
               'kubernetes', 'runpod', 'vast', 'vsphere', 'cudo', 'fluidstack',
-              'paperspace', 'do', 'nebius', 'ssh', 'hyperbolic')
+              'paperspace', 'do', 'nebius', 'ssh', 'hyperbolic', 'seeweb')
 # END constants used for service catalog.
 
 # The user ID of the SkyPilot system.
@@ -510,3 +513,6 @@ SKY_LOCKS_DIR = os.path.expanduser('~/.sky/locks')
 
 ENV_VAR_LOOP_LAG_THRESHOLD_MS = (SKYPILOT_ENV_VAR_PREFIX +
                                  'DEBUG_LOOP_LAG_THRESHOLD_MS')
+
+ARM64_ARCH = 'arm64'
+X86_64_ARCH = 'x86_64'
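Reviewer note: the updated DEACTIVATE_SKY_REMOTE_PYTHON_ENV now also unsets the virtualenv marker variables, since `deactivate` is not available when conda is used. The sketch below is a rough Python equivalent of what that shell snippet does, for illustration only; the default env name is an assumption, as the real value comes from SKY_REMOTE_PYTHON_ENV_NAME.

import os


def deactivate_remote_env(env_name: str = 'skypilot-runtime') -> None:  # assumed name
    """Roughly mirror the shell snippet: strip the venv bin dir from PATH
    and clear the virtualenv marker variables."""
    bin_dir = os.path.expanduser(f'~/{env_name}/bin')
    parts = [p for p in os.environ.get('PATH', '').split(os.pathsep)
             if p != bin_dir]
    os.environ['PATH'] = os.pathsep.join(parts)
    os.environ.pop('VIRTUAL_ENV', None)
    os.environ.pop('VIRTUAL_ENV_PROMPT', None)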
sky/skylet/job_lib.py
CHANGED
@@ -31,8 +31,11 @@ from sky.utils.db import db_utils
 
 if typing.TYPE_CHECKING:
     import psutil
+
+    from sky.schemas.generated import jobsv1_pb2
 else:
     psutil = adaptors_common.LazyImport('psutil')
+    jobsv1_pb2 = adaptors_common.LazyImport('sky.schemas.generated.jobsv1_pb2')
 
 logger = sky_logging.init_logger(__name__)
 
@@ -220,6 +223,45 @@ class JobStatus(enum.Enum):
         color = _JOB_STATUS_TO_COLOR[self]
         return f'{color}{self.value}{colorama.Style.RESET_ALL}'
 
+    @classmethod
+    def from_protobuf(
+            cls,
+            protobuf_value: 'jobsv1_pb2.JobStatus') -> Optional['JobStatus']:
+        """Convert protobuf JobStatus enum to Python enum value."""
+        protobuf_to_enum = {
+            jobsv1_pb2.JOB_STATUS_INIT: cls.INIT,
+            jobsv1_pb2.JOB_STATUS_PENDING: cls.PENDING,
+            jobsv1_pb2.JOB_STATUS_SETTING_UP: cls.SETTING_UP,
+            jobsv1_pb2.JOB_STATUS_RUNNING: cls.RUNNING,
+            jobsv1_pb2.JOB_STATUS_FAILED_DRIVER: cls.FAILED_DRIVER,
+            jobsv1_pb2.JOB_STATUS_SUCCEEDED: cls.SUCCEEDED,
+            jobsv1_pb2.JOB_STATUS_FAILED: cls.FAILED,
+            jobsv1_pb2.JOB_STATUS_FAILED_SETUP: cls.FAILED_SETUP,
+            jobsv1_pb2.JOB_STATUS_CANCELLED: cls.CANCELLED,
+            jobsv1_pb2.JOB_STATUS_UNSPECIFIED: None,
+        }
+        if protobuf_value not in protobuf_to_enum:
+            raise ValueError(
+                f'Unknown protobuf JobStatus value: {protobuf_value}')
+        return protobuf_to_enum[protobuf_value]
+
+    def to_protobuf(self) -> 'jobsv1_pb2.JobStatus':
+        """Convert this Python enum value to protobuf enum value."""
+        enum_to_protobuf = {
+            JobStatus.INIT: jobsv1_pb2.JOB_STATUS_INIT,
+            JobStatus.PENDING: jobsv1_pb2.JOB_STATUS_PENDING,
+            JobStatus.SETTING_UP: jobsv1_pb2.JOB_STATUS_SETTING_UP,
+            JobStatus.RUNNING: jobsv1_pb2.JOB_STATUS_RUNNING,
+            JobStatus.FAILED_DRIVER: jobsv1_pb2.JOB_STATUS_FAILED_DRIVER,
+            JobStatus.SUCCEEDED: jobsv1_pb2.JOB_STATUS_SUCCEEDED,
+            JobStatus.FAILED: jobsv1_pb2.JOB_STATUS_FAILED,
+            JobStatus.FAILED_SETUP: jobsv1_pb2.JOB_STATUS_FAILED_SETUP,
+            JobStatus.CANCELLED: jobsv1_pb2.JOB_STATUS_CANCELLED,
+        }
+        if self not in enum_to_protobuf:
+            raise ValueError(f'Unknown JobStatus value: {self}')
+        return enum_to_protobuf[self]
+
 
 # We have two steps for job submissions:
 # 1. Client reserve a job id from the job table by adding a INIT state job.
@@ -475,6 +517,11 @@ def get_status(job_id: int) -> Optional[JobStatus]:
 
 @init_db
 def get_statuses_payload(job_ids: List[Optional[int]]) -> str:
+    return message_utils.encode_payload(get_statuses(job_ids))
+
+
+@init_db
+def get_statuses(job_ids: List[int]) -> Dict[int, Optional[str]]:
     assert _DB is not None
     # Per-job lock is not required here, since the staled job status will not
     # affect the caller.
@@ -482,10 +529,52 @@ def get_statuses_payload(job_ids: List[Optional[int]]) -> str:
     rows = _DB.cursor.execute(
         f'SELECT job_id, status FROM jobs WHERE job_id IN ({query_str})',
         job_ids)
-    statuses = {job_id: None for job_id in job_ids}
+    statuses: Dict[int, Optional[str]] = {job_id: None for job_id in job_ids}
     for (job_id, status) in rows:
         statuses[job_id] = status
-    return
+    return statuses
+
+
+@init_db
+def get_jobs_info(user_hash: Optional[str] = None,
+                  all_jobs: bool = False) -> List['jobsv1_pb2.JobInfo']:
+    """Get detailed job information.
+
+    Similar to dump_job_queue but returns structured protobuf objects instead
+    of encoded strings.
+
+    Args:
+        user_hash: The user hash to show jobs for. Show all the users if None.
+        all_jobs: Whether to show all jobs, not just the pending/running ones.
+    """
+    assert _DB is not None
+
+    status_list: Optional[List[JobStatus]] = [
+        JobStatus.SETTING_UP, JobStatus.PENDING, JobStatus.RUNNING
+    ]
+    if all_jobs:
+        status_list = None
+
+    jobs = _get_jobs(user_hash, status_list=status_list)
+    jobs_info = []
+    for job in jobs:
+        jobs_info.append(
+            jobsv1_pb2.JobInfo(
+                job_id=job['job_id'],
+                job_name=job['job_name'],
+                username=job['username'],
+                submitted_at=job['submitted_at'],
+                status=job['status'].to_protobuf(),
+                run_timestamp=job['run_timestamp'],
+                start_at=job['start_at']
+                if job['start_at'] is not None else -1.0,
+                end_at=job['end_at'] if job['end_at'] is not None else 0.0,
+                resources=job['resources'] or '',
+                pid=job['pid'],
+                log_path=os.path.join(constants.SKY_LOGS_DIRECTORY,
                                       job['run_timestamp']),
+                metadata=json.dumps(job['metadata'])))
+    return jobs_info
 
 
 def load_statuses_payload(
@@ -527,13 +616,24 @@ def get_job_submitted_or_ended_timestamp_payload(job_id: int,
     `format_job_queue()`), because the job may stay in PENDING if the cluster is
     busy.
     """
+    return message_utils.encode_payload(
+        get_job_submitted_or_ended_timestamp(job_id, get_ended_time))
+
+
+@init_db
+def get_job_submitted_or_ended_timestamp(
+        job_id: int, get_ended_time: bool) -> Optional[float]:
+    """Get the job submitted timestamp.
+
+    Returns the raw timestamp or None if job doesn't exist.
+    """
     assert _DB is not None
     field = 'end_at' if get_ended_time else 'submitted_at'
     rows = _DB.cursor.execute(f'SELECT {field} FROM jobs WHERE job_id=(?)',
                               (job_id,))
     for (timestamp,) in rows:
-        return
-    return
+        return timestamp
+    return None
 
 
 def get_ray_port():
@@ -947,6 +1047,13 @@ def cancel_jobs_encoded_results(jobs: Optional[List[int]],
         Encoded job IDs that are actually cancelled. Caller should use
         message_utils.decode_payload() to parse.
     """
+    return message_utils.encode_payload(cancel_jobs(jobs, cancel_all,
+                                                    user_hash))
+
+
+def cancel_jobs(jobs: Optional[List[int]],
+                cancel_all: bool = False,
+                user_hash: Optional[str] = None) -> List[int]:
     job_records = []
     all_status = [JobStatus.PENDING, JobStatus.SETTING_UP, JobStatus.RUNNING]
     if jobs is None and not cancel_all:
@@ -1010,7 +1117,7 @@ def cancel_jobs_encoded_results(jobs: Optional[List[int]],
             cancelled_ids.append(job['job_id'])
 
     scheduler.schedule_step()
-    return
+    return cancelled_ids
 
 
 @init_db
@@ -1030,6 +1137,17 @@ def get_run_timestamp(job_id: Optional[int]) -> Optional[str]:
 
 @init_db
 def get_log_dir_for_jobs(job_ids: List[Optional[str]]) -> str:
+    """Returns the relative paths to the log files for jobs with globbing,
+    encoded."""
+    job_to_dir = get_job_log_dirs(job_ids)
+    job_to_dir_str: Dict[str, str] = {}
+    for job_id, log_dir in job_to_dir.items():
+        job_to_dir_str[str(job_id)] = log_dir
+    return message_utils.encode_payload(job_to_dir_str)
+
+
+@init_db
+def get_job_log_dirs(job_ids: List[int]) -> Dict[int, str]:
     """Returns the relative paths to the log files for jobs with globbing."""
     assert _DB is not None
     query_str = ' OR '.join(['job_id GLOB (?)'] * len(job_ids))
@@ -1038,16 +1156,16 @@ def get_log_dir_for_jobs(job_ids: List[Optional[str]]) -> str:
         SELECT * FROM jobs
         WHERE {query_str}""", job_ids)
     rows = _DB.cursor.fetchall()
-    job_to_dir = {}
+    job_to_dir: Dict[int, str] = {}
     for row in rows:
         job_id = row[JobInfoLoc.JOB_ID.value]
         if row[JobInfoLoc.LOG_PATH.value]:
-            job_to_dir[
+            job_to_dir[job_id] = row[JobInfoLoc.LOG_PATH.value]
         else:
             run_timestamp = row[JobInfoLoc.RUN_TIMESTAMP.value]
-            job_to_dir[
-
-    return
+            job_to_dir[job_id] = os.path.join(constants.SKY_LOGS_DIRECTORY,
+                                              run_timestamp)
+    return job_to_dir
 
 
 class JobLibCodeGen:
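Reviewer note: with the new conversion helpers, a job status can be round-tripped through its gRPC representation. A usage sketch, assuming an environment where `sky.skylet.job_lib` and the generated `sky.schemas.generated.jobsv1_pb2` module are importable:

from sky.schemas.generated import jobsv1_pb2
from sky.skylet import job_lib

status = job_lib.JobStatus.RUNNING
pb_value = status.to_protobuf()  # jobsv1_pb2.JOB_STATUS_RUNNING
assert job_lib.JobStatus.from_protobuf(pb_value) is status
# Per the mapping above, the unspecified sentinel maps back to None
# rather than raising.
assert job_lib.JobStatus.from_protobuf(
    jobsv1_pb2.JOB_STATUS_UNSPECIFIED) is None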
sky/skylet/log_lib.py
CHANGED
@@ -354,6 +354,17 @@ def run_bash_command_with_log(bash_command: str,
                               shell=True)
 
 
+def run_bash_command_with_log_and_return_pid(
+        bash_command: str,
+        log_path: str,
+        env_vars: Optional[Dict[str, str]] = None,
+        stream_logs: bool = False,
+        with_ray: bool = False):
+    return_code = run_bash_command_with_log(bash_command, log_path, env_vars,
+                                            stream_logs, with_ray)
+    return {'return_code': return_code, 'pid': os.getpid()}
+
+
 def _follow_job_logs(file,
                      job_id: int,
                      start_streaming: bool,
@@ -395,9 +406,9 @@ def _follow_job_logs(file,
                 wait_last_logs = False
                 continue
             status_str = status.value if status is not None else 'None'
-
-                f'Job finished (status: {status_str}).')
-
+            finish = ux_utils.finishing_message(
+                f'Job finished (status: {status_str}).')
+            yield finish + '\n'
             return
 
         time.sleep(SKY_LOG_TAILING_GAP_SECONDS)
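Reviewer note: the new wrapper returns a structured result instead of a bare exit code, so callers can record the worker pid alongside the return code. A hedged usage sketch, assuming a SkyPilot runtime where `sky.skylet.log_lib` is importable; the command and log path are illustrative:

from sky.skylet import log_lib

result = log_lib.run_bash_command_with_log_and_return_pid(
    bash_command='echo hello',
    log_path='/tmp/sky_example_run.log',  # hypothetical log path
    stream_logs=False)
print(result['return_code'], result['pid'])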
sky/skylet/log_lib.pyi
CHANGED
@@ -129,6 +129,15 @@ def run_bash_command_with_log(bash_command: str,
     ...
 
 
+def run_bash_command_with_log_and_return_pid(
+        bash_command: str,
+        log_path: str,
+        env_vars: Optional[Dict[str, str]] = ...,
+        stream_logs: bool = ...,
+        with_ray: bool = ...):
+    ...
+
+
 def tail_logs(job_id: int,
               log_dir: Optional[str],
               managed_job_id: Optional[int] = ...,