skypilot-nightly 1.0.0.dev20250926__py3-none-any.whl → 1.0.0.dev20251001__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly has been flagged as potentially problematic. (The source page linked to more details here; the link target was not preserved in this text.)
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +43 -14
- sky/backends/cloud_vm_ray_backend.py +153 -38
- sky/check.py +0 -29
- sky/client/cli/command.py +48 -26
- sky/client/cli/table_utils.py +91 -0
- sky/client/sdk.py +14 -23
- sky/client/sdk_async.py +5 -5
- sky/core.py +18 -20
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/{3294.03e02ae73455f48e.js → 3294.93d9336bdc032b3a.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-07349868f7905d37.js → [pool]-509b2977a6373bf6.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-8e64d11e58eab5cb.js → webpack-4f0c389a4ce5fd9c.js} +1 -1
- sky/dashboard/out/_next/static/{VXU6_xE28M55BOdwmUUJS → m3YT2i5s6v4SsIdYc8WZa}/_buildManifest.js +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage.py +11 -0
- sky/data/storage_utils.py +1 -45
- sky/execution.py +0 -1
- sky/global_user_state.py +3 -3
- sky/jobs/client/sdk.py +3 -2
- sky/jobs/controller.py +15 -0
- sky/jobs/server/core.py +120 -28
- sky/jobs/server/server.py +1 -1
- sky/jobs/server/utils.py +65 -32
- sky/jobs/state.py +145 -3
- sky/jobs/utils.py +87 -8
- sky/provision/kubernetes/instance.py +1 -1
- sky/schemas/api/responses.py +73 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +70 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +262 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/serve/serve_utils.py +16 -0
- sky/serve/server/core.py +1 -1
- sky/serve/server/impl.py +6 -6
- sky/server/common.py +2 -1
- sky/server/requests/serializers/decoders.py +10 -6
- sky/server/requests/serializers/encoders.py +13 -8
- sky/skylet/constants.py +1 -1
- sky/skylet/job_lib.py +2 -32
- sky/skylet/log_lib.py +211 -0
- sky/skylet/log_lib.pyi +30 -1
- sky/skylet/services.py +208 -2
- sky/skylet/skylet.py +3 -0
- sky/task.py +4 -0
- sky/utils/cluster_utils.py +23 -5
- sky/utils/command_runner.py +21 -5
- sky/utils/command_runner.pyi +11 -0
- sky/utils/volume.py +5 -0
- {skypilot_nightly-1.0.0.dev20250926.dist-info → skypilot_nightly-1.0.0.dev20251001.dist-info}/METADATA +35 -35
- {skypilot_nightly-1.0.0.dev20250926.dist-info → skypilot_nightly-1.0.0.dev20251001.dist-info}/RECORD +70 -66
- sky/dashboard/out/_next/static/chunks/6856-2b3600ff2854d066.js +0 -1
- sky/dashboard/out/_next/static/{VXU6_xE28M55BOdwmUUJS → m3YT2i5s6v4SsIdYc8WZa}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250926.dist-info → skypilot_nightly-1.0.0.dev20251001.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250926.dist-info → skypilot_nightly-1.0.0.dev20251001.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250926.dist-info → skypilot_nightly-1.0.0.dev20251001.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250926.dist-info → skypilot_nightly-1.0.0.dev20251001.dist-info}/top_level.txt +0 -0
sky/skylet/services.py
CHANGED
|
@@ -1,15 +1,20 @@
|
|
|
1
1
|
"""gRPC service implementations for skylet."""
|
|
2
2
|
|
|
3
3
|
import os
|
|
4
|
+
from typing import List, Optional
|
|
4
5
|
|
|
5
6
|
import grpc
|
|
6
7
|
|
|
8
|
+
from sky import exceptions
|
|
7
9
|
from sky import sky_logging
|
|
8
10
|
from sky.jobs import state as managed_job_state
|
|
11
|
+
from sky.jobs import utils as managed_job_utils
|
|
9
12
|
from sky.schemas.generated import autostopv1_pb2
|
|
10
13
|
from sky.schemas.generated import autostopv1_pb2_grpc
|
|
11
14
|
from sky.schemas.generated import jobsv1_pb2
|
|
12
15
|
from sky.schemas.generated import jobsv1_pb2_grpc
|
|
16
|
+
from sky.schemas.generated import managed_jobsv1_pb2
|
|
17
|
+
from sky.schemas.generated import managed_jobsv1_pb2_grpc
|
|
13
18
|
from sky.schemas.generated import servev1_pb2
|
|
14
19
|
from sky.schemas.generated import servev1_pb2_grpc
|
|
15
20
|
from sky.serve import serve_rpc_utils
|
|
@@ -18,9 +23,14 @@ from sky.serve import serve_utils
|
|
|
18
23
|
from sky.skylet import autostop_lib
|
|
19
24
|
from sky.skylet import constants
|
|
20
25
|
from sky.skylet import job_lib
|
|
26
|
+
from sky.skylet import log_lib
|
|
21
27
|
|
|
22
28
|
logger = sky_logging.init_logger(__name__)
|
|
23
29
|
|
|
30
|
+
# In the worst case, flush the log buffer every 50ms,
|
|
31
|
+
# to ensure responsiveness.
|
|
32
|
+
DEFAULT_LOG_CHUNK_FLUSH_INTERVAL = 0.05
|
|
33
|
+
|
|
24
34
|
|
|
25
35
|
class AutostopServiceImpl(autostopv1_pb2_grpc.AutostopServiceServicer):
|
|
26
36
|
"""Implementation of the AutostopService gRPC service."""
|
|
@@ -275,8 +285,39 @@ class JobsServiceImpl(jobsv1_pb2_grpc.JobsServiceServicer):
|
|
|
275
285
|
self,
|
|
276
286
|
request: jobsv1_pb2.TailLogsRequest, # type: ignore[return]
|
|
277
287
|
context: grpc.ServicerContext):
|
|
278
|
-
|
|
279
|
-
|
|
288
|
+
buffer = log_lib.LogBuffer()
|
|
289
|
+
try:
|
|
290
|
+
job_id = request.job_id if request.HasField(
|
|
291
|
+
'job_id') else job_lib.get_latest_job_id()
|
|
292
|
+
managed_job_id = request.managed_job_id if request.HasField(
|
|
293
|
+
'managed_job_id') else None
|
|
294
|
+
log_dir = job_lib.get_log_dir_for_job(job_id)
|
|
295
|
+
if log_dir is None:
|
|
296
|
+
run_timestamp = job_lib.get_run_timestamp(job_id)
|
|
297
|
+
log_dir = None if run_timestamp is None else os.path.join(
|
|
298
|
+
constants.SKY_LOGS_DIRECTORY, run_timestamp)
|
|
299
|
+
|
|
300
|
+
for line in log_lib.buffered_iter_with_timeout(
|
|
301
|
+
buffer,
|
|
302
|
+
log_lib.tail_logs_iter(job_id, log_dir, managed_job_id,
|
|
303
|
+
request.follow, request.tail),
|
|
304
|
+
DEFAULT_LOG_CHUNK_FLUSH_INTERVAL):
|
|
305
|
+
yield jobsv1_pb2.TailLogsResponse(log_line=line)
|
|
306
|
+
|
|
307
|
+
job_status = job_lib.get_status(job_id)
|
|
308
|
+
exit_code = exceptions.JobExitCode.from_job_status(job_status)
|
|
309
|
+
# Fix for dashboard: When follow=False and job is still running
|
|
310
|
+
# (NOT_FINISHED=101), exit with success (0) since fetching current
|
|
311
|
+
# logs is a successful operation.
|
|
312
|
+
# This prevents shell wrappers from printing "command terminated
|
|
313
|
+
# with exit code 101".
|
|
314
|
+
exit_code_int = 0 if not request.follow and int(
|
|
315
|
+
exit_code) == 101 else int(exit_code)
|
|
316
|
+
yield jobsv1_pb2.TailLogsResponse(exit_code=exit_code_int)
|
|
317
|
+
except Exception as e: # pylint: disable=broad-except
|
|
318
|
+
context.abort(grpc.StatusCode.INTERNAL, str(e))
|
|
319
|
+
finally:
|
|
320
|
+
buffer.close()
|
|
280
321
|
|
|
281
322
|
def GetJobStatus( # type: ignore[return]
|
|
282
323
|
self, request: jobsv1_pb2.GetJobStatusRequest,
|
|
@@ -343,3 +384,168 @@ class JobsServiceImpl(jobsv1_pb2_grpc.JobsServiceServicer):
|
|
|
343
384
|
job_log_dirs=job_log_dirs)
|
|
344
385
|
except Exception as e: # pylint: disable=broad-except
|
|
345
386
|
context.abort(grpc.StatusCode.INTERNAL, str(e))
|
|
387
|
+
|
|
388
|
+
|
|
389
|
+
class ManagedJobsServiceImpl(managed_jobsv1_pb2_grpc.ManagedJobsServiceServicer
|
|
390
|
+
):
|
|
391
|
+
"""Implementation of the ManagedJobsService gRPC service."""
|
|
392
|
+
|
|
393
|
+
def GetVersion( # type: ignore[return]
|
|
394
|
+
self, request: managed_jobsv1_pb2.GetVersionRequest,
|
|
395
|
+
context: grpc.ServicerContext
|
|
396
|
+
) -> managed_jobsv1_pb2.GetVersionResponse:
|
|
397
|
+
try:
|
|
398
|
+
return managed_jobsv1_pb2.GetVersionResponse(
|
|
399
|
+
controller_version=constants.SKYLET_VERSION)
|
|
400
|
+
except Exception as e: # pylint: disable=broad-except
|
|
401
|
+
context.abort(grpc.StatusCode.INTERNAL, str(e))
|
|
402
|
+
|
|
403
|
+
def GetJobTable( # type: ignore[return]
|
|
404
|
+
self, request: managed_jobsv1_pb2.GetJobTableRequest,
|
|
405
|
+
context: grpc.ServicerContext
|
|
406
|
+
) -> managed_jobsv1_pb2.GetJobTableResponse:
|
|
407
|
+
try:
|
|
408
|
+
accessible_workspaces = list(request.accessible_workspaces)
|
|
409
|
+
job_ids = list(request.job_ids.ids) if request.job_ids else None
|
|
410
|
+
user_hashes: Optional[List[Optional[str]]] = None
|
|
411
|
+
if request.user_hashes:
|
|
412
|
+
user_hashes = list(request.user_hashes.hashes)
|
|
413
|
+
# For backwards compatibility, we show jobs that do not have a
|
|
414
|
+
# user_hash. TODO: Remove before 0.12.0.
|
|
415
|
+
if request.show_jobs_without_user_hash:
|
|
416
|
+
user_hashes.append(None)
|
|
417
|
+
statuses = list(
|
|
418
|
+
request.statuses.statuses) if request.statuses else None
|
|
419
|
+
|
|
420
|
+
job_queue = managed_job_utils.get_managed_job_queue(
|
|
421
|
+
skip_finished=request.skip_finished,
|
|
422
|
+
accessible_workspaces=accessible_workspaces,
|
|
423
|
+
job_ids=job_ids,
|
|
424
|
+
workspace_match=request.workspace_match
|
|
425
|
+
if request.HasField('workspace_match') else None,
|
|
426
|
+
name_match=request.name_match
|
|
427
|
+
if request.HasField('name_match') else None,
|
|
428
|
+
pool_match=request.pool_match
|
|
429
|
+
if request.HasField('pool_match') else None,
|
|
430
|
+
page=request.page if request.HasField('page') else None,
|
|
431
|
+
limit=request.limit if request.HasField('limit') else None,
|
|
432
|
+
user_hashes=user_hashes,
|
|
433
|
+
statuses=statuses)
|
|
434
|
+
jobs = job_queue['jobs']
|
|
435
|
+
total = job_queue['total']
|
|
436
|
+
total_no_filter = job_queue['total_no_filter']
|
|
437
|
+
status_counts = job_queue['status_counts']
|
|
438
|
+
|
|
439
|
+
jobs_info = []
|
|
440
|
+
for job in jobs:
|
|
441
|
+
job_info = managed_jobsv1_pb2.ManagedJobInfo(
|
|
442
|
+
job_id=job.get('job_id'),
|
|
443
|
+
task_id=job.get('task_id'),
|
|
444
|
+
job_name=job.get('job_name'),
|
|
445
|
+
task_name=job.get('task_name'),
|
|
446
|
+
job_duration=job.get('job_duration'),
|
|
447
|
+
workspace=job.get('workspace'),
|
|
448
|
+
status=managed_job_state.ManagedJobStatus(
|
|
449
|
+
job.get('status')).to_protobuf(),
|
|
450
|
+
schedule_state=managed_job_state.ManagedJobScheduleState(
|
|
451
|
+
job.get('schedule_state')).to_protobuf(),
|
|
452
|
+
resources=job.get('resources'),
|
|
453
|
+
cluster_resources=job.get('cluster_resources'),
|
|
454
|
+
cluster_resources_full=job.get('cluster_resources_full'),
|
|
455
|
+
cloud=job.get('cloud'),
|
|
456
|
+
region=job.get('region'),
|
|
457
|
+
infra=job.get('infra'),
|
|
458
|
+
accelerators=job.get('accelerators'),
|
|
459
|
+
recovery_count=job.get('recovery_count'),
|
|
460
|
+
details=job.get('details'),
|
|
461
|
+
failure_reason=job.get('failure_reason'),
|
|
462
|
+
user_name=job.get('user_name'),
|
|
463
|
+
user_hash=job.get('user_hash'),
|
|
464
|
+
submitted_at=job.get('submitted_at'),
|
|
465
|
+
start_at=job.get('start_at'),
|
|
466
|
+
end_at=job.get('end_at'),
|
|
467
|
+
user_yaml=job.get('user_yaml'),
|
|
468
|
+
entrypoint=job.get('entrypoint'),
|
|
469
|
+
metadata={
|
|
470
|
+
k: v
|
|
471
|
+
for k, v in job.get('metadata', {}).items()
|
|
472
|
+
if v is not None
|
|
473
|
+
},
|
|
474
|
+
pool=job.get('pool'),
|
|
475
|
+
pool_hash=job.get('pool_hash'))
|
|
476
|
+
jobs_info.append(job_info)
|
|
477
|
+
|
|
478
|
+
return managed_jobsv1_pb2.GetJobTableResponse(
|
|
479
|
+
jobs=jobs_info,
|
|
480
|
+
total=total,
|
|
481
|
+
total_no_filter=total_no_filter,
|
|
482
|
+
status_counts=status_counts)
|
|
483
|
+
except Exception as e: # pylint: disable=broad-except
|
|
484
|
+
context.abort(grpc.StatusCode.INTERNAL, str(e))
|
|
485
|
+
|
|
486
|
+
def GetAllJobIdsByName( # type: ignore[return]
|
|
487
|
+
self, request: managed_jobsv1_pb2.GetAllJobIdsByNameRequest,
|
|
488
|
+
context: grpc.ServicerContext
|
|
489
|
+
) -> managed_jobsv1_pb2.GetAllJobIdsByNameResponse:
|
|
490
|
+
try:
|
|
491
|
+
job_name = request.job_name if request.HasField(
|
|
492
|
+
'job_name') else None
|
|
493
|
+
job_ids = managed_job_state.get_all_job_ids_by_name(job_name)
|
|
494
|
+
return managed_jobsv1_pb2.GetAllJobIdsByNameResponse(
|
|
495
|
+
job_ids=job_ids)
|
|
496
|
+
except Exception as e: # pylint: disable=broad-except
|
|
497
|
+
context.abort(grpc.StatusCode.INTERNAL, str(e))
|
|
498
|
+
|
|
499
|
+
def CancelJobs( # type: ignore[return]
|
|
500
|
+
self, request: managed_jobsv1_pb2.CancelJobsRequest,
|
|
501
|
+
context: grpc.ServicerContext
|
|
502
|
+
) -> managed_jobsv1_pb2.CancelJobsResponse:
|
|
503
|
+
try:
|
|
504
|
+
cancellation_criteria = request.WhichOneof('cancellation_criteria')
|
|
505
|
+
if cancellation_criteria is None:
|
|
506
|
+
context.abort(
|
|
507
|
+
grpc.StatusCode.INVALID_ARGUMENT,
|
|
508
|
+
'exactly one cancellation criteria must be specified.')
|
|
509
|
+
|
|
510
|
+
if cancellation_criteria == 'all_users':
|
|
511
|
+
user_hash = request.user_hash if request.HasField(
|
|
512
|
+
'user_hash') else None
|
|
513
|
+
all_users = request.all_users
|
|
514
|
+
if not all_users and user_hash is None:
|
|
515
|
+
context.abort(
|
|
516
|
+
grpc.StatusCode.INVALID_ARGUMENT,
|
|
517
|
+
'user_hash is required when all_users is False')
|
|
518
|
+
message = managed_job_utils.cancel_jobs_by_id(
|
|
519
|
+
job_ids=None,
|
|
520
|
+
all_users=all_users,
|
|
521
|
+
current_workspace=request.current_workspace,
|
|
522
|
+
user_hash=user_hash)
|
|
523
|
+
elif cancellation_criteria == 'job_ids':
|
|
524
|
+
job_ids = list(request.job_ids.ids)
|
|
525
|
+
message = managed_job_utils.cancel_jobs_by_id(
|
|
526
|
+
job_ids=job_ids,
|
|
527
|
+
current_workspace=request.current_workspace)
|
|
528
|
+
elif cancellation_criteria == 'job_name':
|
|
529
|
+
message = managed_job_utils.cancel_job_by_name(
|
|
530
|
+
job_name=request.job_name,
|
|
531
|
+
current_workspace=request.current_workspace)
|
|
532
|
+
elif cancellation_criteria == 'pool_name':
|
|
533
|
+
message = managed_job_utils.cancel_jobs_by_pool(
|
|
534
|
+
pool_name=request.pool_name,
|
|
535
|
+
current_workspace=request.current_workspace)
|
|
536
|
+
else:
|
|
537
|
+
context.abort(
|
|
538
|
+
grpc.StatusCode.INVALID_ARGUMENT,
|
|
539
|
+
f'invalid cancellation criteria: {cancellation_criteria}')
|
|
540
|
+
return managed_jobsv1_pb2.CancelJobsResponse(message=message)
|
|
541
|
+
except Exception as e: # pylint: disable=broad-except
|
|
542
|
+
context.abort(grpc.StatusCode.INTERNAL, str(e))
|
|
543
|
+
|
|
544
|
+
def StreamLogs(
|
|
545
|
+
self,
|
|
546
|
+
request: managed_jobsv1_pb2.
|
|
547
|
+
StreamLogsRequest, # type: ignore[return]
|
|
548
|
+
context: grpc.ServicerContext):
|
|
549
|
+
# TODO(kevin): implement this
|
|
550
|
+
context.abort(grpc.StatusCode.UNIMPLEMENTED,
|
|
551
|
+
'StreamLogs is not implemented')
|
sky/skylet/skylet.py
CHANGED
|
@@ -10,6 +10,7 @@ import sky
|
|
|
10
10
|
from sky import sky_logging
|
|
11
11
|
from sky.schemas.generated import autostopv1_pb2_grpc
|
|
12
12
|
from sky.schemas.generated import jobsv1_pb2_grpc
|
|
13
|
+
from sky.schemas.generated import managed_jobsv1_pb2_grpc
|
|
13
14
|
from sky.schemas.generated import servev1_pb2_grpc
|
|
14
15
|
from sky.skylet import constants
|
|
15
16
|
from sky.skylet import events
|
|
@@ -55,6 +56,8 @@ def start_grpc_server(port: int = constants.SKYLET_GRPC_PORT) -> grpc.Server:
|
|
|
55
56
|
services.JobsServiceImpl(), server)
|
|
56
57
|
servev1_pb2_grpc.add_ServeServiceServicer_to_server(
|
|
57
58
|
services.ServeServiceImpl(), server)
|
|
59
|
+
managed_jobsv1_pb2_grpc.add_ManagedJobsServiceServicer_to_server(
|
|
60
|
+
services.ManagedJobsServiceImpl(), server)
|
|
58
61
|
|
|
59
62
|
listen_addr = f'127.0.0.1:{port}'
|
|
60
63
|
server.add_insecure_port(listen_addr)
|
sky/task.py
CHANGED
|
@@ -649,6 +649,10 @@ class Task:
|
|
|
649
649
|
config['workdir'] = _fill_in_env_vars(config['workdir'],
|
|
650
650
|
env_and_secrets)
|
|
651
651
|
|
|
652
|
+
if config.get('volumes') is not None:
|
|
653
|
+
config['volumes'] = _fill_in_env_vars(config['volumes'],
|
|
654
|
+
env_and_secrets)
|
|
655
|
+
|
|
652
656
|
task = Task(
|
|
653
657
|
config.pop('name', None),
|
|
654
658
|
run=config.pop('run', None),
|
sky/utils/cluster_utils.py
CHANGED
|
@@ -193,11 +193,29 @@ class SSHConfigHelper(object):
|
|
|
193
193
|
proxy_command = auth_config.get('ssh_proxy_command', None)
|
|
194
194
|
|
|
195
195
|
docker_proxy_command_generator = None
|
|
196
|
+
proxy_command_for_nodes = proxy_command
|
|
196
197
|
if docker_user is not None:
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
198
|
+
|
|
199
|
+
def _docker_proxy_cmd(ip: str, port: int) -> str:
|
|
200
|
+
inner_proxy = proxy_command
|
|
201
|
+
inner_port = port or 22
|
|
202
|
+
if inner_proxy is not None:
|
|
203
|
+
inner_proxy = inner_proxy.replace('%h', ip)
|
|
204
|
+
inner_proxy = inner_proxy.replace('%p', str(inner_port))
|
|
205
|
+
return ' '.join(['ssh'] + command_runner.ssh_options_list(
|
|
206
|
+
key_path,
|
|
207
|
+
ssh_control_name=None,
|
|
208
|
+
ssh_proxy_command=inner_proxy,
|
|
209
|
+
port=inner_port,
|
|
210
|
+
# ProxyCommand (ssh -W) is a forwarding tunnel, not an
|
|
211
|
+
# interactive session. ControlMaster would cache these
|
|
212
|
+
# processes, causing them to hang and block subsequent
|
|
213
|
+
# connections. Each ProxyCommand should be ephemeral.
|
|
214
|
+
disable_control_master=True
|
|
215
|
+
) + ['-W', '%h:%p', f'{auth_config["ssh_user"]}@{ip}'])
|
|
216
|
+
|
|
217
|
+
docker_proxy_command_generator = _docker_proxy_cmd
|
|
218
|
+
proxy_command_for_nodes = None
|
|
201
219
|
|
|
202
220
|
codegen = ''
|
|
203
221
|
# Add the nodes to the codegen
|
|
@@ -212,7 +230,7 @@ class SSHConfigHelper(object):
|
|
|
212
230
|
# TODO(romilb): Update port number when k8s supports multinode
|
|
213
231
|
codegen += cls._get_generated_config(
|
|
214
232
|
sky_autogen_comment, node_name, ip, username,
|
|
215
|
-
key_path_for_config,
|
|
233
|
+
key_path_for_config, proxy_command_for_nodes, port,
|
|
216
234
|
docker_proxy_command) + '\n'
|
|
217
235
|
|
|
218
236
|
cluster_config_path = os.path.expanduser(
|
sky/utils/command_runner.py
CHANGED
|
@@ -652,15 +652,31 @@ class SSHCommandRunner(CommandRunner):
|
|
|
652
652
|
if docker_user is not None:
|
|
653
653
|
assert port is None or port == 22, (
|
|
654
654
|
f'port must be None or 22 for docker_user, got {port}.')
|
|
655
|
-
#
|
|
656
|
-
|
|
657
|
-
|
|
655
|
+
# When connecting via docker, the outer SSH hop points to the
|
|
656
|
+
# container's sshd (localhost). Preserve the user proxy for the
|
|
657
|
+
# inner hop that reaches the host VM, and clear the outer proxy to
|
|
658
|
+
# avoid forwarding localhost through the jump host.
|
|
659
|
+
inner_proxy_command = ssh_proxy_command
|
|
660
|
+
inner_proxy_port = port or 22
|
|
661
|
+
self._ssh_proxy_command = None
|
|
658
662
|
self.ip = 'localhost'
|
|
659
663
|
self.ssh_user = docker_user
|
|
660
664
|
self.port = constants.DEFAULT_DOCKER_PORT
|
|
665
|
+
if inner_proxy_command is not None:
|
|
666
|
+
# Replace %h/%p placeholders with actual host values, since the
|
|
667
|
+
# final destination from the perspective of the user proxy is
|
|
668
|
+
# the host VM (ip, inner_proxy_port).
|
|
669
|
+
inner_proxy_command = inner_proxy_command.replace('%h', ip)
|
|
670
|
+
inner_proxy_command = inner_proxy_command.replace(
|
|
671
|
+
'%p', str(inner_proxy_port))
|
|
661
672
|
self._docker_ssh_proxy_command = lambda ssh: ' '.join(
|
|
662
|
-
ssh + ssh_options_list(ssh_private_key,
|
|
663
|
-
|
|
673
|
+
ssh + ssh_options_list(ssh_private_key,
|
|
674
|
+
None,
|
|
675
|
+
ssh_proxy_command=inner_proxy_command,
|
|
676
|
+
port=inner_proxy_port,
|
|
677
|
+
disable_control_master=self.
|
|
678
|
+
disable_control_master) +
|
|
679
|
+
['-W', '%h:%p', f'{ssh_user}@{ip}'])
|
|
664
680
|
else:
|
|
665
681
|
self.ip = ip
|
|
666
682
|
self.ssh_user = ssh_user
|
sky/utils/command_runner.pyi
CHANGED
|
@@ -142,8 +142,10 @@ class SSHCommandRunner(CommandRunner):
|
|
|
142
142
|
ssh_user: str,
|
|
143
143
|
ssh_private_key: str,
|
|
144
144
|
ssh_control_name: Optional[str] = ...,
|
|
145
|
+
ssh_proxy_command: Optional[str] = ...,
|
|
145
146
|
docker_user: Optional[str] = ...,
|
|
146
147
|
disable_control_master: Optional[bool] = ...,
|
|
148
|
+
port_forward_execute_remote_command: Optional[bool] = ...,
|
|
147
149
|
) -> None:
|
|
148
150
|
...
|
|
149
151
|
|
|
@@ -198,6 +200,15 @@ class SSHCommandRunner(CommandRunner):
|
|
|
198
200
|
**kwargs) -> Union[Tuple[int, str, str], int]:
|
|
199
201
|
...
|
|
200
202
|
|
|
203
|
+
def ssh_base_command(
|
|
204
|
+
self,
|
|
205
|
+
*,
|
|
206
|
+
ssh_mode: SshMode,
|
|
207
|
+
port_forward: Optional[List[Tuple[int, int]]],
|
|
208
|
+
connect_timeout: Optional[int],
|
|
209
|
+
) -> List[str]:
|
|
210
|
+
...
|
|
211
|
+
|
|
201
212
|
def rsync(self,
|
|
202
213
|
source: str,
|
|
203
214
|
target: str,
|
sky/utils/volume.py
CHANGED
|
@@ -26,6 +26,11 @@ class VolumeType(enum.Enum):
|
|
|
26
26
|
PVC = 'k8s-pvc'
|
|
27
27
|
RUNPOD_NETWORK_VOLUME = 'runpod-network-volume'
|
|
28
28
|
|
|
29
|
+
@classmethod
|
|
30
|
+
def supported_types(cls) -> list:
|
|
31
|
+
"""Return list of supported volume type values."""
|
|
32
|
+
return [vt.value for vt in cls]
|
|
33
|
+
|
|
29
34
|
|
|
30
35
|
class VolumeMount:
|
|
31
36
|
"""Volume mount specification."""
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: skypilot-nightly
|
|
3
|
-
Version: 1.0.0.dev20250926
|
|
3
|
+
Version: 1.0.0.dev20251001
|
|
4
4
|
Summary: SkyPilot: Run AI on Any Infra — Unified, Faster, Cheaper.
|
|
5
5
|
Author: SkyPilot Team
|
|
6
6
|
License: Apache 2.0
|
|
@@ -154,51 +154,51 @@ Requires-Dist: protobuf<7.0.0,>=5.26.1; extra == "server"
|
|
|
154
154
|
Requires-Dist: aiosqlite; extra == "server"
|
|
155
155
|
Requires-Dist: greenlet; extra == "server"
|
|
156
156
|
Provides-Extra: all
|
|
157
|
-
Requires-Dist: pyopenssl<24.3.0,>=23.2.0; extra == "all"
|
|
158
|
-
Requires-Dist: azure-core>=1.31.0; extra == "all"
|
|
159
|
-
Requires-Dist: docker; extra == "all"
|
|
160
|
-
Requires-Dist: ibm-cos-sdk; extra == "all"
|
|
161
|
-
Requires-Dist: azure-identity>=1.19.0; extra == "all"
|
|
162
|
-
Requires-Dist: azure-mgmt-compute>=33.0.0; extra == "all"
|
|
163
|
-
Requires-Dist: protobuf<7.0.0,>=5.26.1; extra == "all"
|
|
164
157
|
Requires-Dist: azure-cli>=2.65.0; extra == "all"
|
|
165
|
-
Requires-Dist: google-api-python-client>=2.69.0; extra == "all"
|
|
166
|
-
Requires-Dist: boto3>=1.26.1; extra == "all"
|
|
167
|
-
Requires-Dist: msrestazure; extra == "all"
|
|
168
|
-
Requires-Dist: ibm-platform-services>=0.48.0; extra == "all"
|
|
169
|
-
Requires-Dist: aiohttp; extra == "all"
|
|
170
|
-
Requires-Dist: colorama<0.4.5; extra == "all"
|
|
171
|
-
Requires-Dist: websockets; extra == "all"
|
|
172
|
-
Requires-Dist: ecsapi>=0.2.0; extra == "all"
|
|
173
|
-
Requires-Dist: kubernetes!=32.0.0,>=20.0.0; extra == "all"
|
|
174
|
-
Requires-Dist: tomli; python_version < "3.11" and extra == "all"
|
|
175
158
|
Requires-Dist: pydo>=0.3.0; extra == "all"
|
|
159
|
+
Requires-Dist: azure-core>=1.24.0; extra == "all"
|
|
160
|
+
Requires-Dist: anyio; extra == "all"
|
|
176
161
|
Requires-Dist: casbin; extra == "all"
|
|
162
|
+
Requires-Dist: colorama<0.4.5; extra == "all"
|
|
163
|
+
Requires-Dist: ecsapi>=0.2.0; extra == "all"
|
|
164
|
+
Requires-Dist: google-cloud-storage; extra == "all"
|
|
165
|
+
Requires-Dist: google-api-python-client>=2.69.0; extra == "all"
|
|
177
166
|
Requires-Dist: ray[default]>=2.6.1; extra == "all"
|
|
178
|
-
Requires-Dist: azure-core>=1.24.0; extra == "all"
|
|
179
|
-
Requires-Dist: cudo-compute>=0.1.10; extra == "all"
|
|
180
|
-
Requires-Dist: sqlalchemy_adapter; extra == "all"
|
|
181
167
|
Requires-Dist: ibm-vpc; extra == "all"
|
|
182
|
-
Requires-Dist:
|
|
168
|
+
Requires-Dist: awscli>=1.27.10; extra == "all"
|
|
169
|
+
Requires-Dist: kubernetes!=32.0.0,>=20.0.0; extra == "all"
|
|
183
170
|
Requires-Dist: passlib; extra == "all"
|
|
184
|
-
Requires-Dist:
|
|
185
|
-
Requires-Dist:
|
|
186
|
-
Requires-Dist:
|
|
187
|
-
Requires-Dist: nebius>=0.2.47; extra == "all"
|
|
171
|
+
Requires-Dist: protobuf<7.0.0,>=5.26.1; extra == "all"
|
|
172
|
+
Requires-Dist: ibm-cos-sdk; extra == "all"
|
|
173
|
+
Requires-Dist: azure-mgmt-compute>=33.0.0; extra == "all"
|
|
188
174
|
Requires-Dist: pyvmomi==8.0.1.0.2; extra == "all"
|
|
189
|
-
Requires-Dist:
|
|
190
|
-
Requires-Dist: pyjwt; extra == "all"
|
|
191
|
-
Requires-Dist: msgraph-sdk; extra == "all"
|
|
192
|
-
Requires-Dist: azure-common; extra == "all"
|
|
175
|
+
Requires-Dist: nebius>=0.2.47; extra == "all"
|
|
193
176
|
Requires-Dist: azure-mgmt-network>=27.0.0; extra == "all"
|
|
194
177
|
Requires-Dist: vastai-sdk>=0.1.12; extra == "all"
|
|
195
|
-
Requires-Dist:
|
|
178
|
+
Requires-Dist: pyjwt; extra == "all"
|
|
179
|
+
Requires-Dist: azure-core>=1.31.0; extra == "all"
|
|
180
|
+
Requires-Dist: msrestazure; extra == "all"
|
|
181
|
+
Requires-Dist: azure-common; extra == "all"
|
|
182
|
+
Requires-Dist: runpod>=1.6.1; extra == "all"
|
|
183
|
+
Requires-Dist: tomli; python_version < "3.11" and extra == "all"
|
|
184
|
+
Requires-Dist: websockets; extra == "all"
|
|
185
|
+
Requires-Dist: python-dateutil; extra == "all"
|
|
186
|
+
Requires-Dist: aiohttp; extra == "all"
|
|
187
|
+
Requires-Dist: greenlet; extra == "all"
|
|
188
|
+
Requires-Dist: docker; extra == "all"
|
|
189
|
+
Requires-Dist: cudo-compute>=0.1.10; extra == "all"
|
|
190
|
+
Requires-Dist: sqlalchemy_adapter; extra == "all"
|
|
191
|
+
Requires-Dist: msgraph-sdk; extra == "all"
|
|
192
|
+
Requires-Dist: botocore>=1.29.10; extra == "all"
|
|
196
193
|
Requires-Dist: ibm-cloud-sdk-core; extra == "all"
|
|
194
|
+
Requires-Dist: boto3>=1.26.1; extra == "all"
|
|
195
|
+
Requires-Dist: azure-identity>=1.19.0; extra == "all"
|
|
196
|
+
Requires-Dist: pyopenssl<24.3.0,>=23.2.0; extra == "all"
|
|
197
|
+
Requires-Dist: azure-storage-blob>=12.23.1; extra == "all"
|
|
198
|
+
Requires-Dist: grpcio>=1.63.0; extra == "all"
|
|
197
199
|
Requires-Dist: aiosqlite; extra == "all"
|
|
198
|
-
Requires-Dist:
|
|
199
|
-
Requires-Dist:
|
|
200
|
-
Requires-Dist: google-cloud-storage; extra == "all"
|
|
201
|
-
Requires-Dist: greenlet; extra == "all"
|
|
200
|
+
Requires-Dist: ibm-platform-services>=0.48.0; extra == "all"
|
|
201
|
+
Requires-Dist: oci; extra == "all"
|
|
202
202
|
Dynamic: author
|
|
203
203
|
Dynamic: classifier
|
|
204
204
|
Dynamic: description
|