skypilot-nightly 1.0.0.dev20251027__py3-none-any.whl → 1.0.0.dev20251101__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/aws.py +25 -7
- sky/adaptors/coreweave.py +278 -0
- sky/backends/backend_utils.py +9 -6
- sky/backends/cloud_vm_ray_backend.py +2 -3
- sky/check.py +25 -13
- sky/client/cli/command.py +52 -24
- sky/cloud_stores.py +73 -0
- sky/clouds/aws.py +59 -11
- sky/core.py +7 -5
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{YP5Vc3ROcDnTGta0XAhcs → 8ixeA0NVQJN8HUdijid8b}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/{1141-d5204f35a3388bf4.js → 1141-c3c10e2c6ed71a8f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/2755.d6dc6d530fed0b61.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.87a13fba0058865b.js +1 -0
- sky/dashboard/out/_next/static/chunks/{3785.538eb23a098fc304.js → 3785.170be320e0060eaf.js} +1 -1
- sky/dashboard/out/_next/static/chunks/4282-49b2065b7336e496.js +1 -0
- sky/dashboard/out/_next/static/chunks/7615-80aa7b09f45a86d2.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-4ed9236db997b42b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.10a3aac7aad5e3aa.js +31 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-ac4a217f17b087cb.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-fbf2907ce2bb67e2.js → [cluster]-1704039ccaf997cf.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{jobs-0dc34cf9a8710a9f.js → jobs-7eee823559e5cf9f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{users-96d6b8bb2dec055f.js → users-2b172f13f8538a7a.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-fb1b4d3bfb047cad.js → [name]-bbfe5860c93470fd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-6fc994fa1ee6c6bf.js → workspaces-1891376c08050940.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-585d805f693dbceb.js → webpack-e38d5319cd10a3a0.js} +1 -1
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +71 -2
- sky/data/storage.py +166 -9
- sky/global_user_state.py +14 -18
- sky/jobs/constants.py +2 -0
- sky/jobs/controller.py +62 -67
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/scheduler.py +15 -2
- sky/jobs/server/core.py +85 -13
- sky/jobs/server/server.py +14 -13
- sky/jobs/server/utils.py +28 -10
- sky/jobs/state.py +216 -40
- sky/jobs/utils.py +65 -28
- sky/metrics/utils.py +18 -0
- sky/optimizer.py +1 -1
- sky/provision/kubernetes/instance.py +88 -19
- sky/provision/kubernetes/volume.py +2 -2
- sky/schemas/api/responses.py +3 -5
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
- sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
- sky/serve/replica_managers.py +2 -2
- sky/serve/serve_utils.py +9 -2
- sky/serve/server/server.py +8 -7
- sky/server/common.py +21 -15
- sky/server/constants.py +1 -1
- sky/server/daemons.py +23 -17
- sky/server/requests/executor.py +7 -3
- sky/server/requests/payloads.py +2 -0
- sky/server/requests/request_names.py +80 -0
- sky/server/requests/requests.py +137 -102
- sky/server/requests/serializers/decoders.py +0 -6
- sky/server/requests/serializers/encoders.py +33 -6
- sky/server/server.py +105 -36
- sky/server/stream_utils.py +56 -13
- sky/setup_files/dependencies.py +2 -0
- sky/skylet/constants.py +6 -1
- sky/skylet/events.py +7 -0
- sky/skylet/services.py +18 -7
- sky/ssh_node_pools/server.py +5 -4
- sky/task.py +14 -42
- sky/templates/kubernetes-ray.yml.j2 +1 -1
- sky/templates/nebius-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +140 -12
- sky/users/permission.py +4 -1
- sky/utils/cli_utils/status_utils.py +8 -2
- sky/utils/context_utils.py +13 -1
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/resource_checker.py +4 -1
- sky/utils/resources_utils.py +53 -29
- sky/utils/schemas.py +23 -4
- sky/volumes/server/server.py +4 -3
- sky/workspaces/server.py +7 -6
- {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/METADATA +53 -37
- {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/RECORD +106 -100
- sky/dashboard/out/_next/static/chunks/2755.227c84f5adf75c6b.js +0 -26
- sky/dashboard/out/_next/static/chunks/3015-2dcace420c8939f4.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.6d5054a953a818cb.js +0 -1
- sky/dashboard/out/_next/static/chunks/4282-d2f3ef2fbf78e347.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-0389e2cb52412db3.js +0 -1
- sky/dashboard/out/_next/static/chunks/9360.07d78b8552bc9d17.js +0 -31
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c815b90e296b8075.js +0 -16
- sky/dashboard/out/_next/static/css/4c052b4444e52a58.css +0 -3
- /sky/dashboard/out/_next/static/{YP5Vc3ROcDnTGta0XAhcs → 8ixeA0NVQJN8HUdijid8b}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/{_app-513d332313670f2a.js → _app-bde01e4a2beec258.js} +0 -0
- {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/top_level.txt +0 -0
sky/provision/kubernetes/instance.py
CHANGED
@@ -33,6 +33,7 @@ from sky.utils.db import db_utils
 POLL_INTERVAL = 2
 _TIMEOUT_FOR_POD_TERMINATION = 60  # 1 minutes
 _MAX_RETRIES = 3
+_MAX_MISSING_PODS_RETRIES = 5
 _NUM_THREADS = subprocess_utils.get_parallel_threads('kubernetes')

 # Pattern to extract SSH user from command output, handling MOTD contamination
@@ -489,17 +490,17 @@ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int,


 @timeline.event
-def _wait_for_pods_to_run(namespace, context, new_nodes):
+def _wait_for_pods_to_run(namespace, context, cluster_name, new_pods):
     """Wait for pods and their containers to be ready.

     Pods may be pulling images or may be in the process of container
     creation.
     """
-    if not
+    if not new_pods:
         return

     # Create a set of pod names we're waiting for
-    expected_pod_names = {
+    expected_pod_names = {pod.metadata.name for pod in new_pods}

     def _check_init_containers(pod):
         # Check if any of the init containers failed
@@ -526,28 +527,62 @@ def _wait_for_pods_to_run(namespace, context, new_nodes):
                     'Failed to create init container for pod '
                     f'{pod.metadata.name}. Error details: {msg}.')

+    missing_pods_retry = 0
     while True:
         # Get all pods in a single API call
-
+        cluster_name_on_cloud = new_pods[0].metadata.labels[
             k8s_constants.TAG_SKYPILOT_CLUSTER_NAME]
         all_pods = kubernetes.core_api(context).list_namespaced_pod(
             namespace,
             label_selector=
-            f'{k8s_constants.TAG_SKYPILOT_CLUSTER_NAME}={
+            f'{k8s_constants.TAG_SKYPILOT_CLUSTER_NAME}={cluster_name_on_cloud}'
+        ).items

         # Get the set of found pod names and check if we have all expected pods
         found_pod_names = {pod.metadata.name for pod in all_pods}
-
-        if
+        missing_pod_names = expected_pod_names - found_pod_names
+        if missing_pod_names:
+            # In _wait_for_pods_to_schedule, we already wait for all pods to go
+            # from pending to scheduled. So if a pod is missing here, it means
+            # something unusual must have happened, and so should be treated as
+            # an exception.
+            # It is also only in _wait_for_pods_to_schedule that
+            # provision_timeout is used.
+            # TODO(kevin): Should we take provision_timeout into account here,
+            # instead of hardcoding the number of retries?
+            if missing_pods_retry >= _MAX_MISSING_PODS_RETRIES:
+                for pod_name in missing_pod_names:
+                    reason = _get_pod_missing_reason(context, namespace,
+                                                     cluster_name, pod_name)
+                    logger.warning(f'Pod {pod_name} missing: {reason}')
+                raise config_lib.KubernetesError(
+                    f'Failed to get all pods after {missing_pods_retry} '
+                    f'retries. Some pods may have been terminated or failed '
+                    f'unexpectedly. Run `sky logs --provision {cluster_name}` '
+                    'for more details.')
             logger.info('Retrying running pods check: '
-                        f'Missing pods: {
+                        f'Missing pods: {missing_pod_names}')
             time.sleep(0.5)
+            missing_pods_retry += 1
             continue

         all_pods_running = True
         for pod in all_pods:
             if pod.metadata.name not in expected_pod_names:
                 continue
+
+            # Check if pod is terminated/preempted/failed.
+            if (pod.metadata.deletion_timestamp is not None or
+                    pod.status.phase == 'Failed'):
+                # Get the reason and write to cluster events before
+                # the pod gets completely deleted from the API.
+                reason = _get_pod_termination_reason(pod, cluster_name)
+                logger.warning(f'Pod {pod.metadata.name} terminated: {reason}')
+                raise config_lib.KubernetesError(
+                    f'Pod {pod.metadata.name} has terminated or failed '
+                    f'unexpectedly. Run `sky logs --provision {cluster_name}` '
+                    'for more details.')
+
             # Continue if pod and all the containers within the
             # pod are successfully created and running.
             if pod.status.phase == 'Running' and all(
@@ -1169,7 +1204,7 @@ def _create_pods(region: str, cluster_name: str, cluster_name_on_cloud: str,
     # fail early if there is an error
     logger.debug(f'run_instances: waiting for pods to be running (pulling '
                  f'images): {[pod.metadata.name for pod in pods]}')
-    _wait_for_pods_to_run(namespace, context, pods)
+    _wait_for_pods_to_run(namespace, context, cluster_name, pods)
     logger.debug(f'run_instances: all pods are scheduled and running: '
                  f'{[pod.metadata.name for pod in pods]}')

@@ -1428,9 +1463,45 @@ def get_cluster_info(


 def _get_pod_termination_reason(pod: Any, cluster_name: str) -> str:
-    """Get pod termination reason and write to cluster events.
-
+    """Get pod termination reason and write to cluster events.
+
+    Checks both pod conditions (for preemption/disruption) and
+    container statuses (for exit codes/errors).
+    """
     latest_timestamp = pod.status.start_time or datetime.datetime.min
+    ready_state = 'Unknown'
+    termination_reason = 'Terminated unexpectedly'
+    container_reasons = []
+
+    # Check pod status conditions for high level overview.
+    # No need to sort, as each condition.type will only appear once.
+    for condition in pod.status.conditions:
+        reason = condition.reason or 'Unknown reason'
+        message = condition.message or ''
+
+        # Get last known readiness state.
+        if condition.type == 'Ready':
+            ready_state = f'{reason} ({message})' if message else reason
+        # Kueue preemption, as defined in:
+        # https://pkg.go.dev/sigs.k8s.io/kueue/pkg/controller/jobs/pod#pkg-constants
+        elif condition.type == 'TerminationTarget':
+            termination_reason = f'Preempted by Kueue: {reason}'
+            if message:
+                termination_reason += f' ({message})'
+        # Generic disruption.
+        elif condition.type == 'DisruptionTarget':
+            termination_reason = f'Disrupted: {reason}'
+            if message:
+                termination_reason += f' ({message})'
+
+        if condition.last_transition_time is not None:
+            latest_timestamp = max(latest_timestamp,
+                                   condition.last_transition_time)
+
+    pod_reason = (f'{termination_reason}.\n'
+                  f'Last known state: {ready_state}.')
+
+    # Check container statuses for exit codes/errors
     if pod.status and pod.status.container_statuses:
         for container_status in pod.status.container_statuses:
             terminated = container_status.state.terminated
@@ -1445,18 +1516,15 @@ def _get_pod_termination_reason(pod: Any, cluster_name: str) -> str:
             if reason is None:
                 # just in-case reason is None, have default for debugging
                 reason = f'exit({exit_code})'
-
-
-            latest_timestamp = terminated.finished_at
+            container_reasons.append(reason)
+            latest_timestamp = max(latest_timestamp, terminated.finished_at)

     # TODO (kyuds): later, if needed, query `last_state` too.

-    if not reasons:
-        return ''
-
     # Normally we will have a single container per pod for skypilot
     # but doing this just in-case there are multiple containers.
-
+    if container_reasons:
+        pod_reason += f'\nContainer errors: {" | ".join(container_reasons)}'

     global_user_state.add_cluster_event(
         cluster_name,
@@ -1658,9 +1726,10 @@ def query_instances(
                            Optional[str]]] = {}
     for pod in pods:
         phase = pod.status.phase
+        is_terminating = pod.metadata.deletion_timestamp is not None
         pod_status = status_map[phase]
         reason = None
-        if phase in ('Failed', 'Unknown'):
+        if phase in ('Failed', 'Unknown') or is_terminating:
             reason = _get_pod_termination_reason(pod, cluster_name)
             logger.debug(f'Pod Status ({phase}) Reason(s): {reason}')
         if non_terminated_only and pod_status is None:
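Note on the hunks above: the reworked wait loop in _wait_for_pods_to_run now tolerates pods that temporarily drop out of the list API, retrying up to _MAX_MISSING_PODS_RETRIES times before raising KubernetesError with each pod's last known reason. A minimal sketch of that retry shape, with list_pods and report_missing as hypothetical stand-ins for the Kubernetes calls used in the real module:

import time

_MAX_MISSING_PODS_RETRIES = 5  # mirrors the constant added at the top of instance.py


def wait_until_all_present(expected_pod_names, list_pods, report_missing,
                           poll_interval=0.5):
    """Retry a bounded number of times before treating missing pods as fatal."""
    missing_pods_retry = 0
    while True:
        # `list_pods` returns the pod names currently visible to the API.
        found = set(list_pods())
        missing = set(expected_pod_names) - found
        if not missing:
            return  # every expected pod is visible; proceed to readiness checks
        if missing_pods_retry >= _MAX_MISSING_PODS_RETRIES:
            # Only after exhausting retries are per-pod reasons surfaced.
            for name in missing:
                report_missing(name)
            raise RuntimeError(f'Pods still missing after retries: {missing}')
        missing_pods_retry += 1
        time.sleep(poll_interval)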
sky/provision/kubernetes/volume.py
CHANGED
@@ -75,7 +75,6 @@ def delete_volume(config: models.VolumeConfig) -> models.VolumeConfig:
     """Deletes a volume."""
     context, namespace = _get_context_namespace(config)
     pvc_name = config.name_on_cloud
-    logger.info(f'Deleting PVC {pvc_name}')
     kubernetes_utils.delete_k8s_resource_with_retry(
         delete_func=lambda pvc_name=pvc_name: kubernetes.core_api(
             context).delete_namespaced_persistent_volume_claim(
@@ -84,6 +83,7 @@ def delete_volume(config: models.VolumeConfig) -> models.VolumeConfig:
             _request_timeout=config_lib.DELETION_TIMEOUT),
         resource_type='pvc',
         resource_name=pvc_name)
+    logger.info(f'Deleted PVC {pvc_name} in namespace {namespace}')
     return config


@@ -242,9 +242,9 @@ def create_persistent_volume_claim(namespace: str, context: Optional[str],
     except kubernetes.api_exception() as e:
         if e.status != 404:  # Not found
             raise
-    logger.info(f'Creating PVC {pvc_name}')
     kubernetes.core_api(context).create_namespaced_persistent_volume_claim(
         namespace=namespace, body=pvc_spec)
+    logger.info(f'Created PVC {pvc_name} in namespace {namespace}')


 def _get_pvc_spec(namespace: str,
sky/schemas/api/responses.py
CHANGED
@@ -79,6 +79,7 @@ class APIHealthResponse(ResponseBaseModel):
     commit: str = ''
     basic_auth_enabled: bool = False
     user: Optional[models.User] = None
+    service_account_token_enabled: bool = False


 class StatusResponse(ResponseBaseModel):
@@ -90,7 +91,7 @@ class StatusResponse(ResponseBaseModel):
     # This is an internally facing field anyway, so it's less
     # of a problem that it's not typed.
     handle: Optional[Any] = None
-    last_use: str
+    last_use: Optional[str] = None
     status: status_lib.ClusterStatus
     autostop: int
     to_down: bool
@@ -98,11 +99,8 @@ class StatusResponse(ResponseBaseModel):
     # metadata is a JSON, so we use Any here.
     metadata: Optional[Dict[str, Any]] = None
     cluster_hash: str
-    # pydantic cannot generate the pydantic-core schema for
-    # storage_mounts_metadata, so we use Any here.
-    storage_mounts_metadata: Optional[Dict[str, Any]] = None
     cluster_ever_up: bool
-    status_updated_at: int
+    status_updated_at: Optional[int] = None
     user_hash: str
     user_name: str
     config_hash: Optional[str] = None
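The StatusResponse changes above relax last_use and status_updated_at to Optional with None defaults (and drop storage_mounts_metadata from the response model), presumably so status payloads that omit those keys still validate. A minimal sketch of that behavior, assuming pydantic v2 and using a made-up model name:

from typing import Optional

import pydantic


class StatusLike(pydantic.BaseModel):
    # Optional fields with a None default validate even when the key is absent.
    name: str
    last_use: Optional[str] = None
    status_updated_at: Optional[int] = None


print(StatusLike.model_validate({'name': 'my-cluster'}))
# -> name='my-cluster' last_use=None status_updated_at=None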
sky/schemas/db/spot_jobs/004_job_file_contents.py
ADDED
@@ -0,0 +1,42 @@
+"""Add columns for stored DAG/env file contents.
+
+Revision ID: 004
+Revises: 003
+Create Date: 2025-10-27
+
+"""
+# pylint: disable=invalid-name
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+from sky.utils.db import db_utils
+
+# revision identifiers, used by Alembic.
+revision: str = '004'
+down_revision: Union[str, Sequence[str], None] = '003'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade():
+    """Add columns to persist job file contents in the database."""
+    with op.get_context().autocommit_block():
+        db_utils.add_column_to_table_alembic('job_info',
+                                             'dag_yaml_content',
+                                             sa.Text(),
+                                             server_default=None)
+        db_utils.add_column_to_table_alembic('job_info',
+                                             'original_user_yaml_content',
+                                             sa.Text(),
+                                             server_default=None)
+        db_utils.add_column_to_table_alembic('job_info',
+                                             'env_file_content',
+                                             sa.Text(),
+                                             server_default=None)
+
+
+def downgrade():
+    """No downgrade logic."""
+    pass
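The migration above goes through db_utils.add_column_to_table_alembic, whose implementation is not part of this diff. For one of the columns, a plain-Alembic equivalent would look roughly like the sketch below; the SkyPilot helper presumably also handles the column-already-exists case, which op.add_column alone does not:

import sqlalchemy as sa
from alembic import op


def upgrade():
    # Nullable Text column with no server default, matching the call above.
    op.add_column(
        'job_info',
        sa.Column('dag_yaml_content', sa.Text(), nullable=True,
                  server_default=None))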
sky/schemas/db/spot_jobs/005_logs_gc.py
ADDED
@@ -0,0 +1,38 @@
+"""Adding columns for the GC time of task logs and controller logs.
+
+Revision ID: 005
+Revises: 004
+Create Date: 2025-10-20
+
+"""
+# pylint: disable=invalid-name
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+from sky.utils.db import db_utils
+
+# revision identifiers, used by Alembic.
+revision: str = '005'
+down_revision: Union[str, Sequence[str], None] = '004'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade():
+    """Add columns for logs gc."""
+    with op.get_context().autocommit_block():
+        db_utils.add_column_to_table_alembic('job_info',
+                                             'controller_logs_cleaned_at',
+                                             sa.Float(),
+                                             server_default=None)
+        db_utils.add_column_to_table_alembic('spot',
+                                             'logs_cleaned_at',
+                                             sa.Float(),
+                                             server_default=None)
+
+
+def downgrade():
+    """Remove columns for logs gc."""
+    pass
sky/schemas/generated/managed_jobsv1_pb2.py
CHANGED
@@ -14,7 +14,7 @@ _sym_db = _symbol_database.Default()



-DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n*sky/schemas/generated/managed_jobsv1.proto\x12\x0fmanaged_jobs.v1\"\x15\n\x06JobIds\x12\x0b\n\x03ids\x18\x01 \x03(\x03\"\x1c\n\nUserHashes\x12\x0e\n\x06hashes\x18\x01 \x03(\t\"\x1c\n\x08Statuses\x12\x10\n\x08statuses\x18\x01 \x03(\t\"\x13\n\x11GetVersionRequest\"0\n\x12GetVersionResponse\x12\x1a\n\x12\x63ontroller_version\x18\x01 \x01(\t\"\
+DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n*sky/schemas/generated/managed_jobsv1.proto\x12\x0fmanaged_jobs.v1\"\x15\n\x06JobIds\x12\x0b\n\x03ids\x18\x01 \x03(\x03\"\x1c\n\nUserHashes\x12\x0e\n\x06hashes\x18\x01 \x03(\t\"\x1c\n\x08Statuses\x12\x10\n\x08statuses\x18\x01 \x03(\t\"\x18\n\x06\x46ields\x12\x0e\n\x06\x66ields\x18\x01 \x03(\t\" \n\nWorkspaces\x12\x12\n\nworkspaces\x18\x01 \x03(\t\"\x13\n\x11GetVersionRequest\"0\n\x12GetVersionResponse\x12\x1a\n\x12\x63ontroller_version\x18\x01 \x01(\t\"\xe1\x04\n\x12GetJobTableRequest\x12\x15\n\rskip_finished\x18\x01 \x01(\x08\x12?\n\x15\x61\x63\x63\x65ssible_workspaces\x18\x02 \x01(\x0b\x32\x1b.managed_jobs.v1.WorkspacesH\x00\x88\x01\x01\x12-\n\x07job_ids\x18\x03 \x01(\x0b\x32\x17.managed_jobs.v1.JobIdsH\x01\x88\x01\x01\x12\x1c\n\x0fworkspace_match\x18\x04 \x01(\tH\x02\x88\x01\x01\x12\x17\n\nname_match\x18\x05 \x01(\tH\x03\x88\x01\x01\x12\x17\n\npool_match\x18\x06 \x01(\tH\x04\x88\x01\x01\x12\x11\n\x04page\x18\x07 \x01(\x05H\x05\x88\x01\x01\x12\x12\n\x05limit\x18\x08 \x01(\x05H\x06\x88\x01\x01\x12\x35\n\x0buser_hashes\x18\t \x01(\x0b\x32\x1b.managed_jobs.v1.UserHashesH\x07\x88\x01\x01\x12\x30\n\x08statuses\x18\n \x01(\x0b\x32\x19.managed_jobs.v1.StatusesH\x08\x88\x01\x01\x12#\n\x1bshow_jobs_without_user_hash\x18\x0b \x01(\x08\x12,\n\x06\x66ields\x18\x0c \x01(\x0b\x32\x17.managed_jobs.v1.FieldsH\t\x88\x01\x01\x42\x18\n\x16_accessible_workspacesB\n\n\x08_job_idsB\x12\n\x10_workspace_matchB\r\n\x0b_name_matchB\r\n\x0b_pool_matchB\x07\n\x05_pageB\x08\n\x06_limitB\x0e\n\x0c_user_hashesB\x0b\n\t_statusesB\t\n\x07_fields\"\xcb\x08\n\x0eManagedJobInfo\x12\x0e\n\x06job_id\x18\x01 \x01(\x03\x12\x0f\n\x07task_id\x18\x02 \x01(\x03\x12\x10\n\x08job_name\x18\x03 \x01(\t\x12\x11\n\ttask_name\x18\x04 \x01(\t\x12\x14\n\x0cjob_duration\x18\x05 \x01(\x01\x12\x16\n\tworkspace\x18\x06 \x01(\tH\x00\x88\x01\x01\x12\x31\n\x06status\x18\x07 \x01(\x0e\x32!.managed_jobs.v1.ManagedJobStatus\x12@\n\x0eschedule_state\x18\x08 \x01(\x0e\x32(.managed_jobs.v1.ManagedJobScheduleState\x12\x11\n\tresources\x18\t \x01(\t\x12\x19\n\x11\x63luster_resources\x18\n \x01(\t\x12\x1e\n\x16\x63luster_resources_full\x18\x0b \x01(\t\x12\r\n\x05\x63loud\x18\x0c \x01(\t\x12\x0e\n\x06region\x18\r \x01(\t\x12\r\n\x05infra\x18\x0e \x01(\t\x12G\n\x0c\x61\x63\x63\x65lerators\x18\x0f \x03(\x0b\x32\x31.managed_jobs.v1.ManagedJobInfo.AcceleratorsEntry\x12\x16\n\x0erecovery_count\x18\x10 \x01(\x05\x12\x14\n\x07\x64\x65tails\x18\x11 \x01(\tH\x01\x88\x01\x01\x12\x1b\n\x0e\x66\x61ilure_reason\x18\x12 \x01(\tH\x02\x88\x01\x01\x12\x16\n\tuser_name\x18\x13 \x01(\tH\x03\x88\x01\x01\x12\x16\n\tuser_hash\x18\x14 \x01(\tH\x04\x88\x01\x01\x12\x19\n\x0csubmitted_at\x18\x15 \x01(\x01H\x05\x88\x01\x01\x12\x15\n\x08start_at\x18\x16 \x01(\x01H\x06\x88\x01\x01\x12\x13\n\x06\x65nd_at\x18\x17 \x01(\x01H\x07\x88\x01\x01\x12\x16\n\tuser_yaml\x18\x18 \x01(\tH\x08\x88\x01\x01\x12\x17\n\nentrypoint\x18\x19 \x01(\tH\t\x88\x01\x01\x12?\n\x08metadata\x18\x1a \x03(\x0b\x32-.managed_jobs.v1.ManagedJobInfo.MetadataEntry\x12\x11\n\x04pool\x18\x1b \x01(\tH\n\x88\x01\x01\x12\x16\n\tpool_hash\x18\x1c \x01(\tH\x0b\x88\x01\x01\x12\x14\n\x07_job_id\x18\x1d \x01(\x03H\x0c\x88\x01\x01\x1a\x33\n\x11\x41\x63\x63\x65leratorsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\x02:\x02\x38\x01\x1a/\n\rMetadataEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x42\x0c\n\n_workspaceB\n\n\x08_detailsB\x11\n\x0f_failure_reasonB\x0c\n\n_user_nameB\x0c\n\n_user_hashB\x0f\n\r_submitted_atB\x0b\n\t_start_atB\t\n\x07_end_atB\x0c\n\n_user_yamlB\r\n\x0b_entrypointB\x07\n\x05_poolB\x0c\n\n_pool_hashB\n\n\x08X_job_id\"\xf0\x01\n\x13GetJobTableResponse\x12-\n\x04jobs\x18\x01 \x03(\x0b\x32\x1f.managed_jobs.v1.ManagedJobInfo\x12\r\n\x05total\x18\x02 \x01(\x05\x12\x17\n\x0ftotal_no_filter\x18\x03 \x01(\x05\x12M\n\rstatus_counts\x18\x04 \x03(\x0b\x32\x36.managed_jobs.v1.GetJobTableResponse.StatusCountsEntry\x1a\x33\n\x11StatusCountsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\x05:\x02\x38\x01\"?\n\x19GetAllJobIdsByNameRequest\x12\x15\n\x08job_name\x18\x01 \x01(\tH\x00\x88\x01\x01\x42\x0b\n\t_job_name\"-\n\x1aGetAllJobIdsByNameResponse\x12\x0f\n\x07job_ids\x18\x01 \x03(\x03\"\xd7\x01\n\x11\x43\x61ncelJobsRequest\x12\x19\n\x11\x63urrent_workspace\x18\x01 \x01(\t\x12\x16\n\tuser_hash\x18\x02 \x01(\tH\x01\x88\x01\x01\x12\x13\n\tall_users\x18\x03 \x01(\x08H\x00\x12*\n\x07job_ids\x18\x04 \x01(\x0b\x32\x17.managed_jobs.v1.JobIdsH\x00\x12\x12\n\x08job_name\x18\x05 \x01(\tH\x00\x12\x13\n\tpool_name\x18\x06 \x01(\tH\x00\x42\x17\n\x15\x63\x61ncellation_criteriaB\x0c\n\n_user_hash\"%\n\x12\x43\x61ncelJobsResponse\x12\x0f\n\x07message\x18\x01 \x01(\t\"\x97\x01\n\x11StreamLogsRequest\x12\x15\n\x08job_name\x18\x01 \x01(\tH\x00\x88\x01\x01\x12\x13\n\x06job_id\x18\x02 \x01(\x03H\x01\x88\x01\x01\x12\x0e\n\x06\x66ollow\x18\x03 \x01(\x08\x12\x12\n\ncontroller\x18\x04 \x01(\x08\x12\x11\n\x04tail\x18\x05 \x01(\x05H\x02\x88\x01\x01\x42\x0b\n\t_job_nameB\t\n\x07_job_idB\x07\n\x05_tail\"L\n\x12StreamLogsResponse\x12\x10\n\x08log_line\x18\x01 \x01(\t\x12\x16\n\texit_code\x18\x02 \x01(\x05H\x00\x88\x01\x01\x42\x0c\n\n_exit_code*\x85\x04\n\x10ManagedJobStatus\x12\"\n\x1eMANAGED_JOB_STATUS_UNSPECIFIED\x10\x00\x12\x1e\n\x1aMANAGED_JOB_STATUS_PENDING\x10\x01\x12 \n\x1cMANAGED_JOB_STATUS_SUBMITTED\x10\x02\x12\x1f\n\x1bMANAGED_JOB_STATUS_STARTING\x10\x03\x12\x1e\n\x1aMANAGED_JOB_STATUS_RUNNING\x10\x04\x12!\n\x1dMANAGED_JOB_STATUS_RECOVERING\x10\x05\x12!\n\x1dMANAGED_JOB_STATUS_CANCELLING\x10\x06\x12 \n\x1cMANAGED_JOB_STATUS_SUCCEEDED\x10\x07\x12 \n\x1cMANAGED_JOB_STATUS_CANCELLED\x10\x08\x12\x1d\n\x19MANAGED_JOB_STATUS_FAILED\x10\t\x12#\n\x1fMANAGED_JOB_STATUS_FAILED_SETUP\x10\n\x12\'\n#MANAGED_JOB_STATUS_FAILED_PRECHECKS\x10\x0b\x12)\n%MANAGED_JOB_STATUS_FAILED_NO_RESOURCE\x10\x0c\x12(\n$MANAGED_JOB_STATUS_FAILED_CONTROLLER\x10\r*\x8f\x03\n\x17ManagedJobScheduleState\x12*\n&MANAGED_JOB_SCHEDULE_STATE_UNSPECIFIED\x10\x00\x12&\n\"MANAGED_JOB_SCHEDULE_STATE_INVALID\x10\x01\x12\'\n#MANAGED_JOB_SCHEDULE_STATE_INACTIVE\x10\x02\x12&\n\"MANAGED_JOB_SCHEDULE_STATE_WAITING\x10\x03\x12,\n(MANAGED_JOB_SCHEDULE_STATE_ALIVE_WAITING\x10\x04\x12(\n$MANAGED_JOB_SCHEDULE_STATE_LAUNCHING\x10\x05\x12,\n(MANAGED_JOB_SCHEDULE_STATE_ALIVE_BACKOFF\x10\x06\x12$\n MANAGED_JOB_SCHEDULE_STATE_ALIVE\x10\x07\x12#\n\x1fMANAGED_JOB_SCHEDULE_STATE_DONE\x10\x08\x32\xe4\x03\n\x12ManagedJobsService\x12U\n\nGetVersion\x12\".managed_jobs.v1.GetVersionRequest\x1a#.managed_jobs.v1.GetVersionResponse\x12X\n\x0bGetJobTable\x12#.managed_jobs.v1.GetJobTableRequest\x1a$.managed_jobs.v1.GetJobTableResponse\x12m\n\x12GetAllJobIdsByName\x12*.managed_jobs.v1.GetAllJobIdsByNameRequest\x1a+.managed_jobs.v1.GetAllJobIdsByNameResponse\x12U\n\nCancelJobs\x12\".managed_jobs.v1.CancelJobsRequest\x1a#.managed_jobs.v1.CancelJobsResponse\x12W\n\nStreamLogs\x12\".managed_jobs.v1.StreamLogsRequest\x1a#.managed_jobs.v1.StreamLogsResponse0\x01\x62\x06proto3')

 _globals = globals()
 _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
@@ -27,44 +27,48 @@ if not _descriptor._USE_C_DESCRIPTORS:
   _globals['_MANAGEDJOBINFO_METADATAENTRY']._serialized_options = b'8\001'
   _globals['_GETJOBTABLERESPONSE_STATUSCOUNTSENTRY']._loaded_options = None
   _globals['_GETJOBTABLERESPONSE_STATUSCOUNTSENTRY']._serialized_options = b'8\001'
-  _globals['_MANAGEDJOBSTATUS']._serialized_start=
-  _globals['_MANAGEDJOBSTATUS']._serialized_end=
-  _globals['_MANAGEDJOBSCHEDULESTATE']._serialized_start=
-  _globals['_MANAGEDJOBSCHEDULESTATE']._serialized_end=
+  _globals['_MANAGEDJOBSTATUS']._serialized_start=2836
+  _globals['_MANAGEDJOBSTATUS']._serialized_end=3353
+  _globals['_MANAGEDJOBSCHEDULESTATE']._serialized_start=3356
+  _globals['_MANAGEDJOBSCHEDULESTATE']._serialized_end=3755
   _globals['_JOBIDS']._serialized_start=63
   _globals['_JOBIDS']._serialized_end=84
   _globals['_USERHASHES']._serialized_start=86
   _globals['_USERHASHES']._serialized_end=114
   _globals['_STATUSES']._serialized_start=116
   _globals['_STATUSES']._serialized_end=144
-  _globals['
-  _globals['
-  _globals['
-  _globals['
-  _globals['
-  _globals['
-  _globals['
-  _globals['
-  _globals['
-  _globals['
-  _globals['
-  _globals['
-  _globals['
-  _globals['
-  _globals['
-  _globals['
-  _globals['
-  _globals['
-  _globals['
-  _globals['
-  _globals['
-  _globals['
-  _globals['
-  _globals['
-  _globals['
-  _globals['
-  _globals['
-  _globals['
-  _globals['
-  _globals['
+  _globals['_FIELDS']._serialized_start=146
+  _globals['_FIELDS']._serialized_end=170
+  _globals['_WORKSPACES']._serialized_start=172
+  _globals['_WORKSPACES']._serialized_end=204
+  _globals['_GETVERSIONREQUEST']._serialized_start=206
+  _globals['_GETVERSIONREQUEST']._serialized_end=225
+  _globals['_GETVERSIONRESPONSE']._serialized_start=227
+  _globals['_GETVERSIONRESPONSE']._serialized_end=275
+  _globals['_GETJOBTABLEREQUEST']._serialized_start=278
+  _globals['_GETJOBTABLEREQUEST']._serialized_end=887
+  _globals['_MANAGEDJOBINFO']._serialized_start=890
+  _globals['_MANAGEDJOBINFO']._serialized_end=1989
+  _globals['_MANAGEDJOBINFO_ACCELERATORSENTRY']._serialized_start=1711
+  _globals['_MANAGEDJOBINFO_ACCELERATORSENTRY']._serialized_end=1762
+  _globals['_MANAGEDJOBINFO_METADATAENTRY']._serialized_start=1764
+  _globals['_MANAGEDJOBINFO_METADATAENTRY']._serialized_end=1811
+  _globals['_GETJOBTABLERESPONSE']._serialized_start=1992
+  _globals['_GETJOBTABLERESPONSE']._serialized_end=2232
+  _globals['_GETJOBTABLERESPONSE_STATUSCOUNTSENTRY']._serialized_start=2181
+  _globals['_GETJOBTABLERESPONSE_STATUSCOUNTSENTRY']._serialized_end=2232
+  _globals['_GETALLJOBIDSBYNAMEREQUEST']._serialized_start=2234
+  _globals['_GETALLJOBIDSBYNAMEREQUEST']._serialized_end=2297
+  _globals['_GETALLJOBIDSBYNAMERESPONSE']._serialized_start=2299
+  _globals['_GETALLJOBIDSBYNAMERESPONSE']._serialized_end=2344
+  _globals['_CANCELJOBSREQUEST']._serialized_start=2347
+  _globals['_CANCELJOBSREQUEST']._serialized_end=2562
+  _globals['_CANCELJOBSRESPONSE']._serialized_start=2564
+  _globals['_CANCELJOBSRESPONSE']._serialized_end=2601
+  _globals['_STREAMLOGSREQUEST']._serialized_start=2604
+  _globals['_STREAMLOGSREQUEST']._serialized_end=2755
+  _globals['_STREAMLOGSRESPONSE']._serialized_start=2757
+  _globals['_STREAMLOGSRESPONSE']._serialized_end=2833
+  _globals['_MANAGEDJOBSSERVICE']._serialized_start=3758
+  _globals['_MANAGEDJOBSSERVICE']._serialized_end=4242
 # @@protoc_insertion_point(module_scope)
sky/schemas/generated/managed_jobsv1_pb2.pyi
CHANGED
@@ -76,6 +76,18 @@ class Statuses(_message.Message):
     statuses: _containers.RepeatedScalarFieldContainer[str]
     def __init__(self, statuses: _Optional[_Iterable[str]] = ...) -> None: ...

+class Fields(_message.Message):
+    __slots__ = ("fields",)
+    FIELDS_FIELD_NUMBER: _ClassVar[int]
+    fields: _containers.RepeatedScalarFieldContainer[str]
+    def __init__(self, fields: _Optional[_Iterable[str]] = ...) -> None: ...
+
+class Workspaces(_message.Message):
+    __slots__ = ("workspaces",)
+    WORKSPACES_FIELD_NUMBER: _ClassVar[int]
+    workspaces: _containers.RepeatedScalarFieldContainer[str]
+    def __init__(self, workspaces: _Optional[_Iterable[str]] = ...) -> None: ...
+
 class GetVersionRequest(_message.Message):
     __slots__ = ()
     def __init__(self) -> None: ...
@@ -87,7 +99,7 @@ class GetVersionResponse(_message.Message):
     def __init__(self, controller_version: _Optional[str] = ...) -> None: ...

 class GetJobTableRequest(_message.Message):
-    __slots__ = ("skip_finished", "accessible_workspaces", "job_ids", "workspace_match", "name_match", "pool_match", "page", "limit", "user_hashes", "statuses", "show_jobs_without_user_hash")
+    __slots__ = ("skip_finished", "accessible_workspaces", "job_ids", "workspace_match", "name_match", "pool_match", "page", "limit", "user_hashes", "statuses", "show_jobs_without_user_hash", "fields")
     SKIP_FINISHED_FIELD_NUMBER: _ClassVar[int]
     ACCESSIBLE_WORKSPACES_FIELD_NUMBER: _ClassVar[int]
     JOB_IDS_FIELD_NUMBER: _ClassVar[int]
@@ -99,8 +111,9 @@ class GetJobTableRequest(_message.Message):
     USER_HASHES_FIELD_NUMBER: _ClassVar[int]
     STATUSES_FIELD_NUMBER: _ClassVar[int]
     SHOW_JOBS_WITHOUT_USER_HASH_FIELD_NUMBER: _ClassVar[int]
+    FIELDS_FIELD_NUMBER: _ClassVar[int]
     skip_finished: bool
-    accessible_workspaces:
+    accessible_workspaces: Workspaces
     job_ids: JobIds
     workspace_match: str
     name_match: str
@@ -110,10 +123,11 @@ class GetJobTableRequest(_message.Message):
     user_hashes: UserHashes
     statuses: Statuses
     show_jobs_without_user_hash: bool
-
+    fields: Fields
+    def __init__(self, skip_finished: bool = ..., accessible_workspaces: _Optional[_Union[Workspaces, _Mapping]] = ..., job_ids: _Optional[_Union[JobIds, _Mapping]] = ..., workspace_match: _Optional[str] = ..., name_match: _Optional[str] = ..., pool_match: _Optional[str] = ..., page: _Optional[int] = ..., limit: _Optional[int] = ..., user_hashes: _Optional[_Union[UserHashes, _Mapping]] = ..., statuses: _Optional[_Union[Statuses, _Mapping]] = ..., show_jobs_without_user_hash: bool = ..., fields: _Optional[_Union[Fields, _Mapping]] = ...) -> None: ...

 class ManagedJobInfo(_message.Message):
-    __slots__ = ("job_id", "task_id", "job_name", "task_name", "job_duration", "workspace", "status", "schedule_state", "resources", "cluster_resources", "cluster_resources_full", "cloud", "region", "infra", "accelerators", "recovery_count", "details", "failure_reason", "user_name", "user_hash", "submitted_at", "start_at", "end_at", "user_yaml", "entrypoint", "metadata", "pool", "pool_hash")
+    __slots__ = ("job_id", "task_id", "job_name", "task_name", "job_duration", "workspace", "status", "schedule_state", "resources", "cluster_resources", "cluster_resources_full", "cloud", "region", "infra", "accelerators", "recovery_count", "details", "failure_reason", "user_name", "user_hash", "submitted_at", "start_at", "end_at", "user_yaml", "entrypoint", "metadata", "pool", "pool_hash", "_job_id")
     class AcceleratorsEntry(_message.Message):
         __slots__ = ("key", "value")
         KEY_FIELD_NUMBER: _ClassVar[int]
@@ -156,6 +170,7 @@ class ManagedJobInfo(_message.Message):
     METADATA_FIELD_NUMBER: _ClassVar[int]
     POOL_FIELD_NUMBER: _ClassVar[int]
     POOL_HASH_FIELD_NUMBER: _ClassVar[int]
+    _JOB_ID_FIELD_NUMBER: _ClassVar[int]
     job_id: int
     task_id: int
     job_name: str
@@ -184,7 +199,8 @@ class ManagedJobInfo(_message.Message):
     metadata: _containers.ScalarMap[str, str]
     pool: str
     pool_hash: str
-
+    _job_id: int
+    def __init__(self, job_id: _Optional[int] = ..., task_id: _Optional[int] = ..., job_name: _Optional[str] = ..., task_name: _Optional[str] = ..., job_duration: _Optional[float] = ..., workspace: _Optional[str] = ..., status: _Optional[_Union[ManagedJobStatus, str]] = ..., schedule_state: _Optional[_Union[ManagedJobScheduleState, str]] = ..., resources: _Optional[str] = ..., cluster_resources: _Optional[str] = ..., cluster_resources_full: _Optional[str] = ..., cloud: _Optional[str] = ..., region: _Optional[str] = ..., infra: _Optional[str] = ..., accelerators: _Optional[_Mapping[str, float]] = ..., recovery_count: _Optional[int] = ..., details: _Optional[str] = ..., failure_reason: _Optional[str] = ..., user_name: _Optional[str] = ..., user_hash: _Optional[str] = ..., submitted_at: _Optional[float] = ..., start_at: _Optional[float] = ..., end_at: _Optional[float] = ..., user_yaml: _Optional[str] = ..., entrypoint: _Optional[str] = ..., metadata: _Optional[_Mapping[str, str]] = ..., pool: _Optional[str] = ..., pool_hash: _Optional[str] = ..., _job_id: _Optional[int] = ...) -> None: ...

 class GetJobTableResponse(_message.Message):
     __slots__ = ("jobs", "total", "total_no_filter", "status_counts")
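The new Fields and Workspaces messages let GetJobTableRequest carry a field projection and a workspace filter. Based only on the stubs above (how the controller interprets these fields is not shown in this diff), a request could be built roughly like this; the field names passed to Fields are illustrative:

from sky.schemas.generated import managed_jobsv1_pb2 as pb

# Ask only for a subset of columns, scoped to one workspace.
request = pb.GetJobTableRequest(
    skip_finished=True,
    accessible_workspaces=pb.Workspaces(workspaces=['default']),
    fields=pb.Fields(fields=['job_id', 'job_name', 'status']),
)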
sky/serve/replica_managers.py
CHANGED
@@ -495,8 +495,8 @@ class ReplicaInfo:
             info_dict['cloud'] = repr(handle.launched_resources.cloud)
             info_dict['region'] = handle.launched_resources.region
             info_dict['resources_str'] = (
-                resources_utils.get_readable_resources_repr(
-
+                resources_utils.get_readable_resources_repr(
+                    handle, simplified_only=True)[0])
         return info_dict

     def __repr__(self) -> str:
sky/serve/serve_utils.py
CHANGED
@@ -1550,8 +1550,15 @@ def _format_replica_table(replica_records: List[Dict[str, Any]], show_all: bool,
             'handle']
         if replica_handle is not None:
             infra = replica_handle.launched_resources.infra.formatted_str()
-
-
+            simplified = not show_all
+            resources_str_simple, resources_str_full = (
+                resources_utils.get_readable_resources_repr(
+                    replica_handle, simplified_only=simplified))
+            if simplified:
+                resources_str = resources_str_simple
+            else:
+                assert resources_str_full is not None
+                resources_str = resources_str_full

         replica_values = [
             service_name,
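Both serve hunks above use the same calling convention: resources_utils.get_readable_resources_repr(handle, simplified_only=...) appears to return a (simplified, full) pair, where the full form may be None when only the simplified form was requested. A small sketch of a caller under that assumption:

from sky.utils import resources_utils


def pick_resources_str(handle, show_all: bool) -> str:
    # Request the full form only when the caller wants verbose output.
    simple, full = resources_utils.get_readable_resources_repr(
        handle, simplified_only=not show_all)
    if show_all:
        assert full is not None
        return full
    return simple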
sky/serve/server/server.py
CHANGED
@@ -10,6 +10,7 @@ from sky.server import common as server_common
 from sky.server import stream_utils
 from sky.server.requests import executor
 from sky.server.requests import payloads
+from sky.server.requests import request_names
 from sky.server.requests import requests as api_requests
 from sky.skylet import constants
 from sky.utils import common
@@ -25,7 +26,7 @@ async def up(
 ) -> None:
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.SERVE_UP,
         request_body=up_body,
         func=core.up,
         schedule_type=api_requests.ScheduleType.LONG,
@@ -40,7 +41,7 @@ async def update(
 ) -> None:
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.SERVE_UPDATE,
         request_body=update_body,
         func=core.update,
         schedule_type=api_requests.ScheduleType.SHORT,
@@ -55,7 +56,7 @@ async def down(
 ) -> None:
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.SERVE_DOWN,
         request_body=down_body,
         func=core.down,
         schedule_type=api_requests.ScheduleType.SHORT,
@@ -70,7 +71,7 @@ async def terminate_replica(
 ) -> None:
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.SERVE_TERMINATE_REPLICA,
         request_body=terminate_replica_body,
         func=core.terminate_replica,
         schedule_type=api_requests.ScheduleType.SHORT,
@@ -85,7 +86,7 @@ async def status(
 ) -> None:
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.SERVE_STATUS,
         request_body=status_body,
         func=core.status,
         schedule_type=api_requests.ScheduleType.SHORT,
@@ -101,7 +102,7 @@ async def tail_logs(
     executor.check_request_thread_executor_available()
     request_task = await executor.prepare_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.SERVE_LOGS,
         request_body=log_body,
         func=core.tail_logs,
         schedule_type=api_requests.ScheduleType.SHORT,
@@ -134,7 +135,7 @@ async def download_logs(
     download_logs_body.local_dir = str(logs_dir_on_api_server)
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.SERVE_SYNC_DOWN_LOGS,
         request_body=download_logs_body,
         func=core.sync_down_logs,
         schedule_type=api_requests.ScheduleType.SHORT,
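Every handler above now passes a member of request_names.RequestName instead of the previous inline value. The enum itself lives in the new sky/server/requests/request_names.py (listed near the top of this diff) and is not shown here, so the following is only a guess at its shape; the member names are taken from the hunks above, the string values are hypothetical:

import enum


class RequestName(str, enum.Enum):
    # Hypothetical values; only the member names appear in this diff.
    SERVE_UP = 'serve.up'
    SERVE_UPDATE = 'serve.update'
    SERVE_DOWN = 'serve.down'
    SERVE_TERMINATE_REPLICA = 'serve.terminate_replica'
    SERVE_STATUS = 'serve.status'
    SERVE_LOGS = 'serve.logs'
    SERVE_SYNC_DOWN_LOGS = 'serve.sync_down_logs'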