skypilot-nightly 1.0.0.dev20251203__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +6 -2
- sky/adaptors/aws.py +1 -61
- sky/adaptors/slurm.py +565 -0
- sky/backends/backend_utils.py +95 -12
- sky/backends/cloud_vm_ray_backend.py +224 -65
- sky/backends/task_codegen.py +380 -4
- sky/catalog/__init__.py +0 -3
- sky/catalog/data_fetchers/fetch_gcp.py +9 -1
- sky/catalog/data_fetchers/fetch_nebius.py +1 -1
- sky/catalog/data_fetchers/fetch_vast.py +4 -2
- sky/catalog/kubernetes_catalog.py +12 -4
- sky/catalog/seeweb_catalog.py +30 -15
- sky/catalog/shadeform_catalog.py +5 -2
- sky/catalog/slurm_catalog.py +236 -0
- sky/catalog/vast_catalog.py +30 -6
- sky/check.py +25 -11
- sky/client/cli/command.py +391 -32
- sky/client/interactive_utils.py +190 -0
- sky/client/sdk.py +64 -2
- sky/client/sdk_async.py +9 -0
- sky/clouds/__init__.py +2 -0
- sky/clouds/aws.py +60 -2
- sky/clouds/azure.py +2 -0
- sky/clouds/cloud.py +7 -0
- sky/clouds/kubernetes.py +2 -0
- sky/clouds/runpod.py +38 -7
- sky/clouds/slurm.py +610 -0
- sky/clouds/ssh.py +3 -2
- sky/clouds/vast.py +39 -16
- sky/core.py +197 -37
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
- sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
- sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
- sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
- sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
- sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
- sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
- sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
- sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
- sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
- sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
- sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
- sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
- sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
- sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
- sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
- sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-7ad6bd01858556f1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{clusters-ee39056f9851a3ff.js → clusters-57632ff3684a8b5c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-449a9f5a3bb20fb3.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{volumes-b84b948ff357c43e.js → volumes-a83ba9b38dff7ea9.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-84a40f8c7c627fe4.js → [name]-c781e9c3e52ef9fc.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
- sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +26 -12
- sky/data/mounting_utils.py +44 -5
- sky/global_user_state.py +111 -19
- sky/jobs/client/sdk.py +8 -3
- sky/jobs/controller.py +191 -31
- sky/jobs/recovery_strategy.py +109 -11
- sky/jobs/server/core.py +81 -4
- sky/jobs/server/server.py +14 -0
- sky/jobs/state.py +417 -19
- sky/jobs/utils.py +73 -80
- sky/models.py +11 -0
- sky/optimizer.py +8 -6
- sky/provision/__init__.py +12 -9
- sky/provision/common.py +20 -0
- sky/provision/docker_utils.py +15 -2
- sky/provision/kubernetes/utils.py +163 -20
- sky/provision/kubernetes/volume.py +52 -17
- sky/provision/provisioner.py +17 -7
- sky/provision/runpod/instance.py +3 -1
- sky/provision/runpod/utils.py +13 -1
- sky/provision/runpod/volume.py +25 -9
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +618 -0
- sky/provision/slurm/utils.py +689 -0
- sky/provision/vast/instance.py +4 -1
- sky/provision/vast/utils.py +11 -6
- sky/resources.py +135 -13
- sky/schemas/api/responses.py +4 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
- sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
- sky/schemas/db/spot_jobs/009_job_events.py +32 -0
- sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
- sky/schemas/db/spot_jobs/011_add_links.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +9 -5
- sky/schemas/generated/jobsv1_pb2.pyi +12 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
- sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
- sky/serve/serve_utils.py +232 -40
- sky/serve/server/impl.py +1 -1
- sky/server/common.py +17 -0
- sky/server/constants.py +1 -1
- sky/server/metrics.py +6 -3
- sky/server/plugins.py +238 -0
- sky/server/requests/executor.py +5 -2
- sky/server/requests/payloads.py +30 -1
- sky/server/requests/request_names.py +4 -0
- sky/server/requests/requests.py +33 -11
- sky/server/requests/serializers/encoders.py +22 -0
- sky/server/requests/serializers/return_value_serializers.py +70 -0
- sky/server/server.py +506 -109
- sky/server/server_utils.py +30 -0
- sky/server/uvicorn.py +5 -0
- sky/setup_files/MANIFEST.in +1 -0
- sky/setup_files/dependencies.py +22 -9
- sky/sky_logging.py +2 -1
- sky/skylet/attempt_skylet.py +13 -3
- sky/skylet/constants.py +55 -13
- sky/skylet/events.py +10 -4
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +187 -0
- sky/skylet/job_lib.py +91 -5
- sky/skylet/log_lib.py +22 -6
- sky/skylet/log_lib.pyi +8 -6
- sky/skylet/services.py +18 -3
- sky/skylet/skylet.py +5 -1
- sky/skylet/subprocess_daemon.py +2 -1
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/{utils/kubernetes/deploy_ssh_node_pools.py → ssh_node_pools/deploy/deploy.py} +279 -504
- sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +11 -13
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/templates/kubernetes-ray.yml.j2 +12 -6
- sky/templates/slurm-ray.yml.j2 +115 -0
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +18 -41
- sky/users/model.conf +1 -1
- sky/users/permission.py +85 -52
- sky/users/rbac.py +31 -3
- sky/utils/annotations.py +108 -8
- sky/utils/auth_utils.py +42 -0
- sky/utils/cli_utils/status_utils.py +19 -5
- sky/utils/cluster_utils.py +10 -3
- sky/utils/command_runner.py +389 -35
- sky/utils/command_runner.pyi +43 -4
- sky/utils/common_utils.py +47 -31
- sky/utils/context.py +32 -0
- sky/utils/db/db_utils.py +36 -6
- sky/utils/db/migration_utils.py +41 -21
- sky/utils/infra_utils.py +5 -1
- sky/utils/instance_links.py +139 -0
- sky/utils/interactive_utils.py +49 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
- sky/utils/kubernetes/kubernetes_deploy_utils.py +2 -94
- sky/utils/kubernetes/rsync_helper.sh +5 -1
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/plugin_extensions/__init__.py +14 -0
- sky/utils/plugin_extensions/external_failure_source.py +176 -0
- sky/utils/resources_utils.py +10 -8
- sky/utils/rich_utils.py +9 -11
- sky/utils/schemas.py +93 -19
- sky/utils/status_lib.py +7 -0
- sky/utils/subprocess_utils.py +17 -0
- sky/volumes/client/sdk.py +6 -3
- sky/volumes/server/core.py +65 -27
- sky_templates/ray/start_cluster +8 -4
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +67 -59
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +208 -180
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +0 -11
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +0 -1
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +0 -1
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +0 -1
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +0 -21
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +0 -1
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +0 -3
- /sky/dashboard/out/_next/static/{96_E2yl3QAiIJGOYCkSpB → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
- /sky/{utils/kubernetes → ssh_node_pools/deploy/tunnel}/cleanup-tunnel.sh +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
sky/skylet/job_lib.py
CHANGED

@@ -66,6 +66,7 @@ class JobInfoLoc(enum.IntEnum):
     PID = 9
     LOG_PATH = 10
     METADATA = 11
+    EXIT_CODES = 12


 def create_table(cursor, conn):
@@ -124,6 +125,8 @@ def create_table(cursor, conn):
                                  'metadata',
                                  'TEXT DEFAULT \'{}\'',
                                  value_to_replace_existing_entries='{}')
+    db_utils.add_column_to_table(cursor, conn, 'jobs', 'exit_codes',
+                                 'TEXT DEFAULT NULL')
     conn.commit()


@@ -388,10 +391,16 @@ def add_job(job_name: str,
     assert _DB is not None
     job_submitted_at = time.time()
     # job_id will autoincrement with the null value
-    _DB.cursor.execute(
-        'INSERT INTO jobs VALUES (null, ?, ?, ?, ?, ?, ?, null, ?, 0, null, ?)',
-        (job_name, username, job_submitted_at, JobStatus.INIT.value,
-         run_timestamp, None, resources_str, metadata))
+    if int(constants.SKYLET_VERSION) >= 28:
+        _DB.cursor.execute(
+            'INSERT INTO jobs VALUES (null, ?, ?, ?, ?, ?, ?, null, ?, 0, null, ?, null)',  # pylint: disable=line-too-long
+            (job_name, username, job_submitted_at, JobStatus.INIT.value,
+             run_timestamp, None, resources_str, metadata))
+    else:
+        _DB.cursor.execute(
+            'INSERT INTO jobs VALUES (null, ?, ?, ?, ?, ?, ?, null, ?, 0, null, ?)',  # pylint: disable=line-too-long
+            (job_name, username, job_submitted_at, JobStatus.INIT.value,
+             run_timestamp, None, resources_str, metadata))
     _DB.conn.commit()
     rows = _DB.cursor.execute('SELECT job_id FROM jobs WHERE run_timestamp=(?)',
                               (run_timestamp,))
@@ -468,6 +477,41 @@ def set_status(job_id: int, status: JobStatus) -> None:
         _set_status_no_lock(job_id, status)


+@init_db
+def set_exit_codes(job_id: int, exit_codes: List[int]) -> None:
+    """Set exit codes for a job as comma-separated string.
+
+    Args:
+        job_id: The job ID to update.
+        exit_codes: A list of exit codes to store.
+    """
+    assert _DB is not None
+    exit_codes_str = ','.join(str(code) for code in exit_codes)
+    with filelock.FileLock(_get_lock_path(job_id)):
+        _DB.cursor.execute('UPDATE jobs SET exit_codes=(?) WHERE job_id=(?)',
+                           (exit_codes_str, job_id))
+        _DB.conn.commit()
+
+
+@init_db
+def get_exit_codes(job_id: int) -> Optional[List[int]]:
+    """Get exit codes for a job from comma-separated string.
+
+    Args:
+        job_id: The job ID to retrieve exit codes for.
+
+    Returns:
+        A list of exit codes, or None if not found.
+    """
+    assert _DB is not None
+    rows = _DB.cursor.execute('SELECT exit_codes FROM jobs WHERE job_id=(?)',
+                              (job_id,))
+    row = rows.fetchone()
+    if row is None or row[0] is None:
+        return None
+    return [int(code) for code in row[0].split(',')]
+
+
 @init_db
 def set_job_started(job_id: int) -> None:
     # TODO(mraheja): remove pylint disabling when filelock version updated.
@@ -506,6 +550,20 @@ def get_status(job_id: int) -> Optional[JobStatus]:
         return get_status_no_lock(job_id)


+def wait_for_job_completion(job_id: int, poll_interval: float = 1.0) -> None:
+    """Wait for a job to reach a terminal state.
+
+    Args:
+        job_id: The job ID to wait for.
+        poll_interval: How often to poll the job status in seconds.
+    """
+    while True:
+        status = get_status(job_id)
+        if status is None or status.is_terminal():
+            break
+        time.sleep(poll_interval)
+
+
 @init_db
 def get_statuses_payload(job_ids: List[Optional[int]]) -> str:
     return message_utils.encode_payload(get_statuses(job_ids))
@@ -674,6 +732,14 @@ def _get_records_from_rows(rows) -> List[Dict[str, Any]]:
             'pid': row[JobInfoLoc.PID.value],
             'metadata': json.loads(row[JobInfoLoc.METADATA.value]),
         })
+        if int(constants.SKYLET_VERSION) >= 28:
+            exit_code_str = row[JobInfoLoc.EXIT_CODES.value]
+            if not isinstance(exit_code_str, str):
+                records[-1]['exit_codes'] = None
+            else:
+                records[-1]['exit_codes'] = ([
+                    int(code) for code in exit_code_str.split(',')
+                ])
     return records


@@ -1152,6 +1218,15 @@ class JobLibCodeGen:
         ]
         return cls._build(code)

+    @classmethod
+    def wait_for_job(cls, job_id: int) -> str:
+        code = [
+            # TODO(kevin): backward compatibility, remove in 0.13.0.
+            (f'job_lib.wait_for_job_completion({job_id!r}) if '
+             'hasattr(job_lib, "wait_for_job_completion") else None'),
+        ]
+        return cls._build(code)
+
     @classmethod
     def update_status(cls) -> str:
         code = ['job_lib.update_status()']
@@ -1269,8 +1344,19 @@ class JobLibCodeGen:
         ]
         return cls._build(code)

+    @classmethod
+    def get_job_exit_codes(cls, job_id: Optional[int] = None) -> str:
+        """Generate shell command to retrieve exit codes."""
+        code = [
+            f'job_id = {job_id} if {job_id} is not None else job_lib.get_latest_job_id()',  # pylint: disable=line-too-long
+            'exit_codes = job_lib.get_exit_codes(job_id) if job_id is not None and int(constants.SKYLET_VERSION) >= 28 else {}',  # pylint: disable=line-too-long
+            'print(exit_codes, flush=True)',
+        ]
+        return cls._build(code)
+
     @classmethod
     def _build(cls, code: List[str]) -> str:
         code = cls._PREFIX + code
         code = ';'.join(code)
-        return f'{constants.SKY_PYTHON_CMD} -u -c {shlex.quote(code)}'
+        return (f'{constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV}; '
+                f'{constants.SKY_PYTHON_CMD} -u -c {shlex.quote(code)}')
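For context, a minimal sketch of how the new job_lib helpers above could be combined by driver-side code running on the cluster; the job ID and exit codes below are made up for illustration, and the helpers assume the jobs database has been initialized as usual:

    # Hypothetical driver-side usage of the new job_lib helpers.
    from sky.skylet import job_lib

    job_id = 42  # illustrative job ID

    # Block until the job reaches a terminal state, polling every 2 seconds.
    job_lib.wait_for_job_completion(job_id, poll_interval=2.0)

    # Record the per-process exit codes (stored as a comma-separated column).
    job_lib.set_exit_codes(job_id, [0, 1])

    # Read them back later; returns None if no codes were recorded.
    codes = job_lib.get_exit_codes(job_id)
    print(codes)  # [0, 1]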
sky/skylet/log_lib.py
CHANGED

@@ -172,7 +172,7 @@ def run_with_log(
     streaming_prefix: Optional[str] = None,
     log_cmd: bool = False,
     **kwargs,
-) -> Union[int, Tuple[int, str, str]]:
+) -> Union[int, Tuple[int, str, str], Tuple[int, int]]:
     """Runs a command and logs its output to a file.

     Args:
@@ -183,6 +183,8 @@ def run_with_log(
         process_stream: Whether to post-process the stdout/stderr of the
             command, such as replacing or skipping lines on the fly. If
            enabled, lines are printed only when '\r' or '\n' is found.
+        streaming_prefix: Optional prefix for each log line. Can contain {pid}
+            placeholder which will be replaced with the subprocess PID.

     Returns the returncode or returncode, stdout and stderr of the command.
     Note that the stdout and stderr is already decoded.
@@ -228,6 +230,13 @@ def run_with_log(
                 # For backward compatibility, do not specify use_kill_pg by
                 # default.
                 subprocess_utils.kill_process_daemon(proc.pid)
+
+            # Format streaming_prefix with subprocess PID if it contains {pid}
+            formatted_streaming_prefix = streaming_prefix
+            if streaming_prefix and '{pid}' in streaming_prefix:
+                formatted_streaming_prefix = streaming_prefix.format(
+                    pid=proc.pid)
+
             stdout = ''
             stderr = ''
             stdout_stream_handler = None
@@ -256,7 +265,7 @@ def run_with_log(
                     line_processor=line_processor,
                     # Replace CRLF when the output is logged to driver by ray.
                     replace_crlf=with_ray,
-                    streaming_prefix=streaming_prefix,
+                    streaming_prefix=formatted_streaming_prefix,
                 )
                 stdout_stream_handler = functools.partial(
                     _handle_io_stream,
@@ -349,7 +358,8 @@ def run_bash_command_with_log(bash_command: str,
                               log_path: str,
                               env_vars: Optional[Dict[str, str]] = None,
                               stream_logs: bool = False,
-                              with_ray: bool = False):
+                              with_ray: bool = False,
+                              streaming_prefix: Optional[str] = None):
     with tempfile.NamedTemporaryFile('w', prefix='sky_app_',
                                      delete=False) as fp:
         bash_command = make_task_bash_script(bash_command, env_vars=env_vars)
@@ -364,6 +374,7 @@ def run_bash_command_with_log(bash_command: str,
             log_path,
             stream_logs=stream_logs,
             with_ray=with_ray,
+            streaming_prefix=streaming_prefix,
             shell=True)


@@ -372,9 +383,14 @@ def run_bash_command_with_log_and_return_pid(
         log_path: str,
         env_vars: Optional[Dict[str, str]] = None,
         stream_logs: bool = False,
-        with_ray: bool = False):
-    return_code = run_bash_command_with_log(bash_command, log_path, env_vars,
-                                            stream_logs, with_ray)
+        with_ray: bool = False,
+        streaming_prefix: Optional[str] = None):
+    return_code = run_bash_command_with_log(bash_command,
+                                            log_path,
+                                            env_vars,
+                                            stream_logs,
+                                            with_ray,
+                                            streaming_prefix=streaming_prefix)
     return {'return_code': return_code, 'pid': os.getpid()}

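A small sketch of passing the new streaming_prefix parameter through run_bash_command_with_log; the command, log path, and prefix text are placeholders, and the signature matches the change above:

    # Illustrative call; '{pid}' is substituted with the spawned subprocess's PID.
    from sky.skylet import log_lib

    log_lib.run_bash_command_with_log(
        'echo hello',
        log_path='/tmp/sky_example.log',
        stream_logs=True,
        streaming_prefix='(pid={pid}) ',
    )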
sky/skylet/log_lib.pyi
CHANGED

@@ -68,7 +68,7 @@ def run_with_log(cmd: Union[List[str], str],
                  process_stream: bool = ...,
                  line_processor: Optional[log_utils.LineProcessor] = ...,
                  streaming_prefix: Optional[str] = ...,
-
+                 log_cmd: bool = ...,
                  **kwargs) -> int:
     ...

@@ -87,7 +87,7 @@ def run_with_log(cmd: Union[List[str], str],
                  process_stream: bool = ...,
                  line_processor: Optional[log_utils.LineProcessor] = ...,
                  streaming_prefix: Optional[str] = ...,
-
+                 log_cmd: bool = ...,
                  **kwargs) -> Tuple[int, str, str]:
     ...

@@ -106,8 +106,8 @@ def run_with_log(cmd: Union[List[str], str],
                  process_stream: bool = ...,
                  line_processor: Optional[log_utils.LineProcessor] = ...,
                  streaming_prefix: Optional[str] = ...,
-
-                 **kwargs) ->
+                 log_cmd: bool = ...,
+                 **kwargs) -> Tuple[int, int]:
     ...


@@ -125,7 +125,8 @@ def run_bash_command_with_log(bash_command: str,
                              log_path: str,
                              env_vars: Optional[Dict[str, str]] = ...,
                              stream_logs: bool = ...,
-                              with_ray: bool = ...) -> int:
+                              with_ray: bool = ...,
+                              streaming_prefix: Optional[str] = ...) -> int:
     ...


@@ -134,7 +135,8 @@ def run_bash_command_with_log_and_return_pid(
         log_path: str,
         env_vars: Optional[Dict[str, str]] = ...,
         stream_logs: bool = ...,
-        with_ray: bool = ...) -> Dict[str, Union[int, str]]:
+        with_ray: bool = ...,
+        streaming_prefix: Optional[str] = ...) -> Dict[str, Union[int, str]]:
     ...

sky/skylet/services.py
CHANGED

@@ -197,12 +197,11 @@ class JobsServiceImpl(jobsv1_pb2_grpc.JobsServiceServicer):
             f.write(request.codegen)
         os.chmod(script_path, 0o755)

-        cd = f'cd {constants.SKY_REMOTE_WORKDIR}'
         job_submit_cmd = (
             # JOB_CMD_IDENTIFIER is used for identifying the process
             # retrieved with pid is the same driver process.
             f'{job_lib.JOB_CMD_IDENTIFIER.format(job_id)} && '
-            f'{cd} && {constants.SKY_PYTHON_CMD} -u {script_path}'
+            f'{constants.SKY_PYTHON_CMD} -u {script_path}'
             # Do not use &>, which is not POSIX and may not work.
             # Note that the order of ">filename 2>&1" matters.
             f' > {remote_log_path} 2>&1')
@@ -387,6 +386,21 @@ class JobsServiceImpl(jobsv1_pb2_grpc.JobsServiceServicer):
         except Exception as e:  # pylint: disable=broad-except
             context.abort(grpc.StatusCode.INTERNAL, str(e))

+    def GetJobExitCodes(  # type: ignore[return]
+            self, request: jobsv1_pb2.GetJobExitCodesRequest,
+            context: grpc.ServicerContext
+    ) -> jobsv1_pb2.GetJobExitCodesResponse:
+        try:
+            job_id = request.job_id if request.HasField(
+                'job_id') else job_lib.get_latest_job_id()
+            exit_codes: Optional[List[int]] = None
+            if job_id:
+                exit_codes_list = job_lib.get_exit_codes(job_id)
+                exit_codes = exit_codes_list if exit_codes_list else []
+            return jobsv1_pb2.GetJobExitCodesResponse(exit_codes=exit_codes)
+        except Exception as e:  # pylint: disable=broad-except
+            context.abort(grpc.StatusCode.INTERNAL, str(e))
+

 class ManagedJobsServiceImpl(managed_jobsv1_pb2_grpc.ManagedJobsServiceServicer
                             ):
@@ -488,7 +502,8 @@ class ManagedJobsServiceImpl(managed_jobsv1_pb2_grpc.ManagedJobsServiceServicer
                 entrypoint=job.get('entrypoint'),
                 metadata=converted_metadata,
                 pool=job.get('pool'),
-                pool_hash=job.get('pool_hash'))
+                pool_hash=job.get('pool_hash'),
+                links=job.get('links'))
             jobs_info.append(job_info)

         return managed_jobsv1_pb2.GetJobTableResponse(
sky/skylet/skylet.py
CHANGED

@@ -48,8 +48,12 @@ def start_grpc_server(port: int = constants.SKYLET_GRPC_PORT) -> grpc.Server:
     # putting it here for visibility.
     # TODO(kevin): Determine the optimal max number of threads.
     max_workers = min(32, (os.cpu_count() or 1) + 4)
+    # There's only a single skylet process per cluster, so disable
+    # SO_REUSEPORT to raise an error if the port is already in use.
+    options = (('grpc.so_reuseport', 0),)
     server = grpc.server(
-        concurrent.futures.ThreadPoolExecutor(max_workers=max_workers))
+        concurrent.futures.ThreadPoolExecutor(max_workers=max_workers),
+        options=options)

     autostopv1_pb2_grpc.add_AutostopServiceServicer_to_server(
         services.AutostopServiceImpl(), server)
sky/skylet/subprocess_daemon.py
CHANGED

@@ -110,7 +110,8 @@ def kill_process_tree(process: psutil.Process,


 def main():
-
+    daemonize()
+
     parser = argparse.ArgumentParser()
     parser.add_argument('--parent-pid', type=int, required=True)
     parser.add_argument('--proc-pid', type=int, required=True)
sky/ssh_node_pools/constants.py
ADDED

@@ -0,0 +1,12 @@
+"""Constants for SSH Node Pools"""
+# pylint: disable=line-too-long
+import os
+
+DEFAULT_KUBECONFIG_PATH = os.path.expanduser('~/.kube/config')
+SSH_CONFIG_PATH = os.path.expanduser('~/.ssh/config')
+NODE_POOLS_INFO_DIR = os.path.expanduser('~/.sky/ssh_node_pools_info')
+NODE_POOLS_KEY_DIR = os.path.expanduser('~/.sky/ssh_keys')
+DEFAULT_SSH_NODE_POOLS_PATH = os.path.expanduser('~/.sky/ssh_node_pools.yaml')
+
+# TODO (kyuds): make this configurable?
+K3S_TOKEN = 'mytoken'  # Any string can be used as the token
sky/ssh_node_pools/core.py
CHANGED

@@ -1,10 +1,15 @@
 """SSH Node Pool management core functionality."""
 import os
 from pathlib import Path
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Optional, Tuple

 import yaml

+from sky import clouds
+from sky.ssh_node_pools import constants
+from sky.ssh_node_pools import deploy
+from sky.usage import usage_lib
+from sky.utils import common_utils
 from sky.utils import yaml_utils


@@ -12,8 +17,8 @@ class SSHNodePoolManager:
     """Manager for SSH Node Pool configurations."""

     def __init__(self):
-        self.config_path = Path.
-        self.keys_dir = Path.
+        self.config_path = Path(constants.DEFAULT_SSH_NODE_POOLS_PATH)
+        self.keys_dir = Path(constants.NODE_POOLS_KEY_DIR)
         self.keys_dir.mkdir(parents=True, exist_ok=True)

     def get_all_pools(self) -> Dict[str, Any]:
@@ -133,3 +138,35 @@ def list_ssh_keys() -> List[str]:
     """List available SSH keys."""
     manager = SSHNodePoolManager()
     return manager.list_ssh_keys()
+
+
+@usage_lib.entrypoint
+def ssh_up(infra: Optional[str] = None, cleanup: bool = False) -> None:
+    """Deploys or tears down a Kubernetes cluster on SSH targets.
+
+    Args:
+        infra: Name of the cluster configuration in ssh_node_pools.yaml.
+            If None, the first cluster in the file is used.
+        cleanup: If True, clean up the cluster instead of deploying.
+    """
+    deploy.run(cleanup=cleanup, infra=infra)
+
+
+@usage_lib.entrypoint
+def ssh_status(context_name: str) -> Tuple[bool, str]:
+    """Check the status of an SSH Node Pool context.
+
+    Args:
+        context_name: The SSH context name (e.g., 'ssh-my-cluster')
+
+    Returns:
+        Tuple[bool, str]: (is_ready, reason)
+            - is_ready: True if the SSH Node Pool is ready, False otherwise
+            - reason: Explanation of the status
+    """
+    try:
+        is_ready, reason = clouds.SSH.check_single_context(context_name)
+        return is_ready, reason
+    except Exception as e:  # pylint: disable=broad-except
+        return False, ('Failed to check SSH context: '
+                       f'{common_utils.format_exception(e)}')