skypilot-nightly 1.0.0.dev20251009__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +6 -2
- sky/adaptors/aws.py +25 -7
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +59 -149
- sky/backends/backend_utils.py +104 -63
- sky/backends/cloud_vm_ray_backend.py +84 -39
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +24 -28
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/shadeform_catalog.py +165 -0
- sky/check.py +25 -13
- sky/client/cli/command.py +335 -86
- sky/client/cli/flags.py +4 -2
- sky/client/cli/table_utils.py +17 -9
- sky/client/sdk.py +59 -12
- sky/cloud_stores.py +73 -0
- sky/clouds/__init__.py +2 -0
- sky/clouds/aws.py +71 -16
- sky/clouds/azure.py +12 -5
- sky/clouds/cloud.py +19 -9
- sky/clouds/cudo.py +12 -5
- sky/clouds/do.py +4 -1
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +12 -5
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +62 -25
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +12 -5
- sky/clouds/oci.py +12 -5
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +4 -1
- sky/clouds/runpod.py +12 -5
- sky/clouds/scp.py +12 -5
- sky/clouds/seeweb.py +4 -1
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +4 -2
- sky/clouds/vast.py +12 -5
- sky/clouds/vsphere.py +4 -1
- sky/core.py +12 -11
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/{1871-49141c317f3a9020.js → 1871-74503c8e80fd253b.js} +1 -1
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
- sky/dashboard/out/_next/static/chunks/{3785.a19328ba41517b8b.js → 3785.ad6adaa2a0fa9768.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{4725.10f7a9a5d3ea8208.js → 4725.a830b5c9e7867c92.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
- sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-477555ab7c0b13d8.js → [cluster]-a37d2063af475a1c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{clusters-2f61f65487f6d8ff.js → clusters-d44859594e6f8064.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-553b8b5cb65e100b.js → [context]-c0b5935149902e6f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-910a22500c50596f.js → infra-aed0ea19df7cf961.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-bc979970c247d8f3.js → [pool]-6edeb7d06032adfc.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/{jobs-a35a9dc3c5ccd657.js → jobs-479dde13399cf270.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{users-98d2ed979084162a.js → users-5ab3b907622cf0fe.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{volumes-835d14ba94808f79.js → volumes-b84b948ff357c43e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-e8688c35c06f0ac5.js → [name]-c5a3eeee1c218af1.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-69c80d677d3c2949.js → workspaces-22b23febb3e89ce1.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +143 -19
- sky/data/storage.py +168 -11
- sky/exceptions.py +13 -1
- sky/execution.py +13 -0
- sky/global_user_state.py +189 -113
- sky/jobs/client/sdk.py +32 -10
- sky/jobs/client/sdk_async.py +9 -3
- sky/jobs/constants.py +3 -1
- sky/jobs/controller.py +164 -192
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/recovery_strategy.py +59 -82
- sky/jobs/scheduler.py +20 -9
- sky/jobs/server/core.py +105 -23
- sky/jobs/server/server.py +40 -28
- sky/jobs/server/utils.py +32 -11
- sky/jobs/state.py +588 -110
- sky/jobs/utils.py +442 -209
- sky/logs/agent.py +1 -1
- sky/metrics/utils.py +45 -6
- sky/optimizer.py +1 -1
- sky/provision/__init__.py +7 -0
- sky/provision/aws/instance.py +2 -1
- sky/provision/azure/instance.py +2 -1
- sky/provision/common.py +2 -0
- sky/provision/cudo/instance.py +2 -1
- sky/provision/do/instance.py +2 -1
- sky/provision/fluidstack/instance.py +4 -3
- sky/provision/gcp/instance.py +2 -1
- sky/provision/hyperbolic/instance.py +2 -1
- sky/provision/instance_setup.py +10 -2
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +222 -89
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/utils.py +114 -53
- sky/provision/kubernetes/volume.py +5 -4
- sky/provision/lambda_cloud/instance.py +2 -1
- sky/provision/nebius/instance.py +2 -1
- sky/provision/oci/instance.py +2 -1
- sky/provision/paperspace/instance.py +2 -1
- sky/provision/provisioner.py +11 -2
- sky/provision/runpod/instance.py +2 -1
- sky/provision/scp/instance.py +2 -1
- sky/provision/seeweb/instance.py +3 -3
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/vast/instance.py +2 -1
- sky/provision/vsphere/instance.py +2 -1
- sky/resources.py +1 -1
- sky/schemas/api/responses.py +9 -5
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/jobsv1_pb2.py +52 -52
- sky/schemas/generated/jobsv1_pb2.pyi +4 -2
- sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
- sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
- sky/serve/client/impl.py +11 -3
- sky/serve/replica_managers.py +5 -2
- sky/serve/serve_utils.py +9 -2
- sky/serve/server/impl.py +7 -2
- sky/serve/server/server.py +18 -15
- sky/serve/service.py +2 -2
- sky/server/auth/oauth2_proxy.py +2 -5
- sky/server/common.py +31 -28
- sky/server/constants.py +5 -1
- sky/server/daemons.py +27 -19
- sky/server/requests/executor.py +138 -74
- sky/server/requests/payloads.py +9 -1
- sky/server/requests/preconditions.py +13 -10
- sky/server/requests/request_names.py +120 -0
- sky/server/requests/requests.py +485 -153
- sky/server/requests/serializers/decoders.py +26 -13
- sky/server/requests/serializers/encoders.py +56 -11
- sky/server/requests/threads.py +106 -0
- sky/server/rest.py +70 -18
- sky/server/server.py +283 -104
- sky/server/stream_utils.py +233 -59
- sky/server/uvicorn.py +18 -17
- sky/setup_files/alembic.ini +4 -0
- sky/setup_files/dependencies.py +32 -13
- sky/sky_logging.py +0 -2
- sky/skylet/constants.py +30 -7
- sky/skylet/events.py +7 -0
- sky/skylet/log_lib.py +8 -2
- sky/skylet/log_lib.pyi +1 -1
- sky/skylet/services.py +26 -13
- sky/skylet/subprocess_daemon.py +103 -29
- sky/skypilot_config.py +87 -75
- sky/ssh_node_pools/server.py +9 -8
- sky/task.py +67 -54
- sky/templates/kubernetes-ray.yml.j2 +8 -1
- sky/templates/nebius-ray.yml.j2 +1 -0
- sky/templates/shadeform-ray.yml.j2 +72 -0
- sky/templates/websocket_proxy.py +142 -12
- sky/users/permission.py +8 -1
- sky/utils/admin_policy_utils.py +16 -3
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +8 -2
- sky/utils/command_runner.py +11 -0
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +7 -4
- sky/utils/context.py +57 -51
- sky/utils/context_utils.py +30 -12
- sky/utils/controller_utils.py +35 -8
- sky/utils/db/db_utils.py +37 -10
- sky/utils/db/migration_utils.py +8 -4
- sky/utils/locks.py +24 -6
- sky/utils/resource_checker.py +4 -1
- sky/utils/resources_utils.py +53 -29
- sky/utils/schemas.py +23 -4
- sky/utils/subprocess_utils.py +17 -4
- sky/volumes/server/server.py +7 -6
- sky/workspaces/server.py +13 -12
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/METADATA +306 -55
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/RECORD +215 -195
- sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +0 -11
- sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +0 -26
- sky/dashboard/out/_next/static/chunks/3015-8d748834fcc60b46.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.1fafbf42b3bcebff.js +0 -1
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-f6818c84ed8f1c86.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9360.71e83b2ddc844ec2.js +0 -31
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8f058b0346db2aff.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-4f7079dcab6ed653.js +0 -16
- sky/dashboard/out/_next/static/chunks/webpack-6a5ddd0184bfa22c.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/hIViZcQBkn0HE8SpaSsUU/_buildManifest.js +0 -1
- /sky/dashboard/out/_next/static/{hIViZcQBkn0HE8SpaSsUU → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/skylet/constants.py
CHANGED
|
@@ -100,7 +100,7 @@ TASK_ID_LIST_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}TASK_IDS'
|
|
|
100
100
|
# cluster yaml is updated.
|
|
101
101
|
#
|
|
102
102
|
# TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
|
|
103
|
-
SKYLET_VERSION = '
|
|
103
|
+
SKYLET_VERSION = '25'
|
|
104
104
|
# The version of the lib files that skylet/jobs use. Whenever there is an API
|
|
105
105
|
# change for the job_lib or log_lib, we need to bump this version, so that the
|
|
106
106
|
# user can be notified to update their SkyPilot version on the remote cluster.
|
|
@@ -226,7 +226,9 @@ RAY_INSTALLATION_COMMANDS = (
|
|
|
226
226
|
f'{SKY_UV_PIP_CMD} list | grep "ray " | '
|
|
227
227
|
f'grep {SKY_REMOTE_RAY_VERSION} 2>&1 > /dev/null '
|
|
228
228
|
f'|| {RAY_STATUS} || '
|
|
229
|
-
|
|
229
|
+
# The pydantic-core==2.41.3 for arm seems corrupted
|
|
230
|
+
# so we need to avoid that specific version.
|
|
231
|
+
f'{SKY_UV_PIP_CMD} install -U "ray[default]=={SKY_REMOTE_RAY_VERSION}" "pydantic-core==2.41.1"; ' # pylint: disable=line-too-long
|
|
230
232
|
# In some envs, e.g. pip does not have permission to write under /opt/conda
|
|
231
233
|
# ray package will be installed under ~/.local/bin. If the user's PATH does
|
|
232
234
|
# not include ~/.local/bin (the pip install will have the output: `WARNING:
|
|
@@ -402,10 +404,27 @@ OVERRIDEABLE_CONFIG_KEYS_IN_TASK: List[Tuple[str, ...]] = [
|
|
|
402
404
|
]
|
|
403
405
|
# When overriding the SkyPilot configs on the API server with the client one,
|
|
404
406
|
# we skip the following keys because they are meant to be client-side configs.
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
407
|
+
# Also, we skip the consolidation mode config as those should be only set on
|
|
408
|
+
# the API server side.
|
|
409
|
+
SKIPPED_CLIENT_OVERRIDE_KEYS: List[Tuple[str, ...]] = [
|
|
410
|
+
('api_server',),
|
|
411
|
+
('allowed_clouds',),
|
|
412
|
+
('workspaces',),
|
|
413
|
+
('db',),
|
|
414
|
+
('daemons',),
|
|
415
|
+
# TODO(kevin,tian): Override the whole controller config once our test
|
|
416
|
+
# infrastructure supports setting dynamic server side configs.
|
|
417
|
+
# Tests that are affected:
|
|
418
|
+
# - test_managed_jobs_ha_kill_starting
|
|
419
|
+
# - test_managed_jobs_ha_kill_running
|
|
420
|
+
# - all tests that use LOW_CONTROLLER_RESOURCE_ENV or
|
|
421
|
+
# LOW_CONTROLLER_RESOURCE_OVERRIDE_CONFIG (won't cause test failure,
|
|
422
|
+
# but the configs won't be applied)
|
|
423
|
+
('jobs', 'controller', 'consolidation_mode'),
|
|
424
|
+
('serve', 'controller', 'consolidation_mode'),
|
|
425
|
+
('jobs', 'controller', 'controller_logs_gc_retention_hours'),
|
|
426
|
+
('jobs', 'controller', 'task_logs_gc_retention_hours'),
|
|
427
|
+
]
|
|
409
428
|
|
|
410
429
|
# Constants for Azure blob storage
|
|
411
430
|
WAIT_FOR_STORAGE_ACCOUNT_CREATION = 60
|
|
@@ -455,6 +474,7 @@ ENV_VAR_DB_CONNECTION_URI = (f'{SKYPILOT_ENV_VAR_PREFIX}DB_CONNECTION_URI')
|
|
|
455
474
|
# authentication is enabled in the API server.
|
|
456
475
|
ENV_VAR_ENABLE_BASIC_AUTH = 'ENABLE_BASIC_AUTH'
|
|
457
476
|
SKYPILOT_INITIAL_BASIC_AUTH = 'SKYPILOT_INITIAL_BASIC_AUTH'
|
|
477
|
+
SKYPILOT_INGRESS_BASIC_AUTH_ENABLED = 'SKYPILOT_INGRESS_BASIC_AUTH_ENABLED'
|
|
458
478
|
ENV_VAR_ENABLE_SERVICE_ACCOUNTS = 'ENABLE_SERVICE_ACCOUNTS'
|
|
459
479
|
|
|
460
480
|
# Enable debug logging for requests.
|
|
@@ -471,7 +491,7 @@ CATALOG_DIR = '~/.sky/catalogs'
|
|
|
471
491
|
ALL_CLOUDS = ('aws', 'azure', 'gcp', 'ibm', 'lambda', 'scp', 'oci',
|
|
472
492
|
'kubernetes', 'runpod', 'vast', 'vsphere', 'cudo', 'fluidstack',
|
|
473
493
|
'paperspace', 'primeintellect', 'do', 'nebius', 'ssh',
|
|
474
|
-
'hyperbolic', 'seeweb')
|
|
494
|
+
'hyperbolic', 'seeweb', 'shadeform')
|
|
475
495
|
# END constants used for service catalog.
|
|
476
496
|
|
|
477
497
|
# The user ID of the SkyPilot system.
|
|
@@ -531,3 +551,6 @@ ENV_VAR_LOOP_LAG_THRESHOLD_MS = (SKYPILOT_ENV_VAR_PREFIX +
|
|
|
531
551
|
|
|
532
552
|
ARM64_ARCH = 'arm64'
|
|
533
553
|
X86_64_ARCH = 'x86_64'
|
|
554
|
+
|
|
555
|
+
SSH_DISABLE_LATENCY_MEASUREMENT_ENV_VAR = (
|
|
556
|
+
f'{SKYPILOT_ENV_VAR_PREFIX}SSH_DISABLE_LATENCY_MEASUREMENT')
|
sky/skylet/events.py
CHANGED
|
@@ -326,8 +326,15 @@ class AutostopEvent(SkyletEvent):
|
|
|
326
326
|
cluster_name_on_cloud = cluster_config['cluster_name']
|
|
327
327
|
is_cluster_multinode = cluster_config['max_workers'] > 0
|
|
328
328
|
|
|
329
|
+
# Clear AWS credentials from environment to force boto3 to use IAM
|
|
330
|
+
# role attached to the instance (lowest priority in credential chain).
|
|
331
|
+
# This allows the cluster to stop/terminate itself using its IAM role.
|
|
329
332
|
os.environ.pop('AWS_ACCESS_KEY_ID', None)
|
|
330
333
|
os.environ.pop('AWS_SECRET_ACCESS_KEY', None)
|
|
334
|
+
os.environ.pop('AWS_SESSION_TOKEN', None)
|
|
335
|
+
# Point boto3 to /dev/null to skip reading credentials from files.
|
|
336
|
+
os.environ['AWS_SHARED_CREDENTIALS_FILE'] = '/dev/null'
|
|
337
|
+
os.environ['AWS_CONFIG_FILE'] = '/dev/null'
|
|
331
338
|
|
|
332
339
|
# Stop the ray autoscaler to avoid scaling up, during
|
|
333
340
|
# stopping/terminating of the cluster.
|
sky/skylet/log_lib.py
CHANGED
|
@@ -220,7 +220,14 @@ def run_with_log(
|
|
|
220
220
|
stdin=stdin,
|
|
221
221
|
**kwargs) as proc:
|
|
222
222
|
try:
|
|
223
|
-
|
|
223
|
+
if ctx is not None:
|
|
224
|
+
# When running in a coroutine, use kill_pg if available to avoid
|
|
225
|
+
# the overhead of refreshing the process tree in the daemon.
|
|
226
|
+
subprocess_utils.kill_process_daemon(proc.pid, use_kill_pg=True)
|
|
227
|
+
else:
|
|
228
|
+
# For backward compatibility, do not specify use_kill_pg by
|
|
229
|
+
# default.
|
|
230
|
+
subprocess_utils.kill_process_daemon(proc.pid)
|
|
224
231
|
stdout = ''
|
|
225
232
|
stderr = ''
|
|
226
233
|
stdout_stream_handler = None
|
|
@@ -271,7 +278,6 @@ def run_with_log(
|
|
|
271
278
|
stdout, stderr = context_utils.pipe_and_wait_process(
|
|
272
279
|
ctx,
|
|
273
280
|
proc,
|
|
274
|
-
cancel_callback=subprocess_utils.kill_children_processes,
|
|
275
281
|
stdout_stream_handler=stdout_stream_handler,
|
|
276
282
|
stderr_stream_handler=stderr_stream_handler)
|
|
277
283
|
elif process_stream:
|
sky/skylet/log_lib.pyi
CHANGED
sky/skylet/services.py
CHANGED
|
@@ -216,10 +216,12 @@ class JobsServiceImpl(jobsv1_pb2_grpc.JobsServiceServicer):
|
|
|
216
216
|
if pool is not None:
|
|
217
217
|
pool_hash = serve_state.get_service_hash(pool)
|
|
218
218
|
# Add the managed job to job queue database.
|
|
219
|
+
user_id = managed_job.user_id if managed_job.HasField(
|
|
220
|
+
'user_id') else None
|
|
219
221
|
managed_job_state.set_job_info(job_id, managed_job.name,
|
|
220
222
|
managed_job.workspace,
|
|
221
223
|
managed_job.entrypoint, pool,
|
|
222
|
-
pool_hash)
|
|
224
|
+
pool_hash, user_id)
|
|
223
225
|
# Set the managed job to PENDING state to make sure that
|
|
224
226
|
# this managed job appears in the `sky jobs queue`, even
|
|
225
227
|
# if it needs to wait to be submitted.
|
|
@@ -405,18 +407,22 @@ class ManagedJobsServiceImpl(managed_jobsv1_pb2_grpc.ManagedJobsServiceServicer
|
|
|
405
407
|
context: grpc.ServicerContext
|
|
406
408
|
) -> managed_jobsv1_pb2.GetJobTableResponse:
|
|
407
409
|
try:
|
|
408
|
-
accessible_workspaces =
|
|
409
|
-
|
|
410
|
+
accessible_workspaces = (
|
|
411
|
+
list(request.accessible_workspaces.workspaces)
|
|
412
|
+
if request.HasField('accessible_workspaces') else None)
|
|
413
|
+
job_ids = (list(request.job_ids.ids)
|
|
414
|
+
if request.HasField('job_ids') else None)
|
|
410
415
|
user_hashes: Optional[List[Optional[str]]] = None
|
|
411
|
-
if request.user_hashes:
|
|
416
|
+
if request.HasField('user_hashes'):
|
|
412
417
|
user_hashes = list(request.user_hashes.hashes)
|
|
413
418
|
# For backwards compatibility, we show jobs that do not have a
|
|
414
419
|
# user_hash. TODO: Remove before 0.12.0.
|
|
415
420
|
if request.show_jobs_without_user_hash:
|
|
416
421
|
user_hashes.append(None)
|
|
417
|
-
statuses = list(
|
|
418
|
-
|
|
419
|
-
|
|
422
|
+
statuses = (list(request.statuses.statuses)
|
|
423
|
+
if request.HasField('statuses') else None)
|
|
424
|
+
fields = (list(request.fields.fields)
|
|
425
|
+
if request.HasField('fields') else None)
|
|
420
426
|
job_queue = managed_job_utils.get_managed_job_queue(
|
|
421
427
|
skip_finished=request.skip_finished,
|
|
422
428
|
accessible_workspaces=accessible_workspaces,
|
|
@@ -430,7 +436,9 @@ class ManagedJobsServiceImpl(managed_jobsv1_pb2_grpc.ManagedJobsServiceServicer
|
|
|
430
436
|
page=request.page if request.HasField('page') else None,
|
|
431
437
|
limit=request.limit if request.HasField('limit') else None,
|
|
432
438
|
user_hashes=user_hashes,
|
|
433
|
-
statuses=statuses
|
|
439
|
+
statuses=statuses,
|
|
440
|
+
fields=fields,
|
|
441
|
+
)
|
|
434
442
|
jobs = job_queue['jobs']
|
|
435
443
|
total = job_queue['total']
|
|
436
444
|
total_no_filter = job_queue['total_no_filter']
|
|
@@ -438,7 +446,16 @@ class ManagedJobsServiceImpl(managed_jobsv1_pb2_grpc.ManagedJobsServiceServicer
|
|
|
438
446
|
|
|
439
447
|
jobs_info = []
|
|
440
448
|
for job in jobs:
|
|
449
|
+
converted_metadata = None
|
|
450
|
+
metadata = job.get('metadata')
|
|
451
|
+
if metadata:
|
|
452
|
+
converted_metadata = {
|
|
453
|
+
k: v for k, v in metadata.items() if v is not None
|
|
454
|
+
}
|
|
441
455
|
job_info = managed_jobsv1_pb2.ManagedJobInfo(
|
|
456
|
+
# The `spot.job_id`, which can be used to identify
|
|
457
|
+
# different tasks for the same job
|
|
458
|
+
_job_id=job.get('_job_id'),
|
|
442
459
|
job_id=job.get('job_id'),
|
|
443
460
|
task_id=job.get('task_id'),
|
|
444
461
|
job_name=job.get('job_name'),
|
|
@@ -466,11 +483,7 @@ class ManagedJobsServiceImpl(managed_jobsv1_pb2_grpc.ManagedJobsServiceServicer
|
|
|
466
483
|
end_at=job.get('end_at'),
|
|
467
484
|
user_yaml=job.get('user_yaml'),
|
|
468
485
|
entrypoint=job.get('entrypoint'),
|
|
469
|
-
metadata=
|
|
470
|
-
k: v
|
|
471
|
-
for k, v in job.get('metadata', {}).items()
|
|
472
|
-
if v is not None
|
|
473
|
-
},
|
|
486
|
+
metadata=converted_metadata,
|
|
474
487
|
pool=job.get('pool'),
|
|
475
488
|
pool_hash=job.get('pool_hash'))
|
|
476
489
|
jobs_info.append(job_info)
|
sky/skylet/subprocess_daemon.py
CHANGED
|
@@ -4,11 +4,16 @@ processes of proc_pid.
|
|
|
4
4
|
"""
|
|
5
5
|
import argparse
|
|
6
6
|
import os
|
|
7
|
+
import signal
|
|
7
8
|
import sys
|
|
8
9
|
import time
|
|
10
|
+
from typing import List, Optional
|
|
9
11
|
|
|
10
12
|
import psutil
|
|
11
13
|
|
|
14
|
+
# Environment variable to enable kill_pg in subprocess daemon.
|
|
15
|
+
USE_KILL_PG_ENV_VAR = 'SKYPILOT_SUBPROCESS_DAEMON_KILL_PG'
|
|
16
|
+
|
|
12
17
|
|
|
13
18
|
def daemonize():
|
|
14
19
|
"""Detaches the process from its parent process with double-forking.
|
|
@@ -38,8 +43,74 @@ def daemonize():
|
|
|
38
43
|
# This process is now fully detached from the original parent and terminal
|
|
39
44
|
|
|
40
45
|
|
|
41
|
-
|
|
42
|
-
|
|
46
|
+
def get_pgid_if_leader(pid) -> Optional[int]:
|
|
47
|
+
"""Get the process group ID of the target process if it is the leader."""
|
|
48
|
+
try:
|
|
49
|
+
pgid = os.getpgid(pid)
|
|
50
|
+
# Only use process group if the target process is the leader. This is
|
|
51
|
+
# to avoid killing the entire process group while the target process is
|
|
52
|
+
# just a subprocess in the group.
|
|
53
|
+
if pgid == pid:
|
|
54
|
+
print(f'Process group {pgid} is the leader.')
|
|
55
|
+
return pgid
|
|
56
|
+
return None
|
|
57
|
+
except Exception: # pylint: disable=broad-except
|
|
58
|
+
# Process group is only available in UNIX.
|
|
59
|
+
return None
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def kill_process_group(pgid: int) -> bool:
|
|
63
|
+
"""Kill the target process group."""
|
|
64
|
+
try:
|
|
65
|
+
print(f'Terminating process group {pgid}...')
|
|
66
|
+
os.killpg(pgid, signal.SIGTERM)
|
|
67
|
+
except Exception: # pylint: disable=broad-except
|
|
68
|
+
return False
|
|
69
|
+
|
|
70
|
+
# Wait 30s for the process group to exit gracefully.
|
|
71
|
+
time.sleep(30)
|
|
72
|
+
|
|
73
|
+
try:
|
|
74
|
+
print(f'Force killing process group {pgid}...')
|
|
75
|
+
os.killpg(pgid, signal.SIGKILL)
|
|
76
|
+
except Exception: # pylint: disable=broad-except
|
|
77
|
+
pass
|
|
78
|
+
|
|
79
|
+
return True
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def kill_process_tree(process: psutil.Process,
|
|
83
|
+
children: List[psutil.Process]) -> bool:
|
|
84
|
+
"""Kill the process tree of the target process."""
|
|
85
|
+
if process is not None:
|
|
86
|
+
# Kill the target process first to avoid having more children, or fail
|
|
87
|
+
# the process due to the children being defunct.
|
|
88
|
+
children = [process] + children
|
|
89
|
+
|
|
90
|
+
if not children:
|
|
91
|
+
sys.exit()
|
|
92
|
+
|
|
93
|
+
for child in children:
|
|
94
|
+
try:
|
|
95
|
+
child.terminate()
|
|
96
|
+
except psutil.NoSuchProcess:
|
|
97
|
+
continue
|
|
98
|
+
|
|
99
|
+
# Wait 30s for the processes to exit gracefully.
|
|
100
|
+
time.sleep(30)
|
|
101
|
+
|
|
102
|
+
# SIGKILL if they're still running.
|
|
103
|
+
for child in children:
|
|
104
|
+
try:
|
|
105
|
+
child.kill()
|
|
106
|
+
except psutil.NoSuchProcess:
|
|
107
|
+
continue
|
|
108
|
+
|
|
109
|
+
return True
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def main():
|
|
113
|
+
# daemonize()
|
|
43
114
|
parser = argparse.ArgumentParser()
|
|
44
115
|
parser.add_argument('--parent-pid', type=int, required=True)
|
|
45
116
|
parser.add_argument('--proc-pid', type=int, required=True)
|
|
@@ -72,37 +143,40 @@ if __name__ == '__main__':
|
|
|
72
143
|
except (psutil.NoSuchProcess, ValueError):
|
|
73
144
|
pass
|
|
74
145
|
|
|
146
|
+
pgid: Optional[int] = None
|
|
147
|
+
if os.environ.get(USE_KILL_PG_ENV_VAR) == '1':
|
|
148
|
+
# Use kill_pg on UNIX system if allowed to reduce the resource usage.
|
|
149
|
+
# Note that both implementations might leave subprocesses uncancelled:
|
|
150
|
+
# - kill_process_tree(default): a subprocess is able to detach itself
|
|
151
|
+
# from the process tree using the same technique as daemonize(). Also,
|
|
152
|
+
# since we refresh the process tree per second, if the subprocess is
|
|
153
|
+
# launched between the [last_poll, parent_die] interval, the
|
|
154
|
+
# subprocess will not be captured and will not be killed.
|
|
155
|
+
# - kill_process_group: kill_pg will kill all the processes in the group
|
|
156
|
+
# but if a subprocess calls setpgid(0, 0) to detach itself from the
|
|
157
|
+
# process group (usually to daemonize itself), the subprocess will
|
|
158
|
+
# not be killed.
|
|
159
|
+
pgid = get_pgid_if_leader(process.pid)
|
|
160
|
+
|
|
75
161
|
if process is not None and parent_process is not None:
|
|
76
162
|
# Wait for either parent or target process to exit
|
|
77
163
|
while process.is_running() and parent_process.is_running():
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
164
|
+
if pgid is None:
|
|
165
|
+
# Refresh process tree for cleanup if process group is not
|
|
166
|
+
# available.
|
|
167
|
+
try:
|
|
168
|
+
tmp_children = process.children(recursive=True)
|
|
169
|
+
if tmp_children:
|
|
170
|
+
children = tmp_children
|
|
171
|
+
except psutil.NoSuchProcess:
|
|
172
|
+
pass
|
|
84
173
|
time.sleep(1)
|
|
85
174
|
|
|
86
|
-
if
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
175
|
+
if pgid is not None:
|
|
176
|
+
kill_process_group(pgid)
|
|
177
|
+
else:
|
|
178
|
+
kill_process_tree(process, children)
|
|
90
179
|
|
|
91
|
-
if not children:
|
|
92
|
-
sys.exit()
|
|
93
180
|
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
child.terminate()
|
|
97
|
-
except psutil.NoSuchProcess:
|
|
98
|
-
continue
|
|
99
|
-
|
|
100
|
-
# Wait 30s for the processes to exit gracefully.
|
|
101
|
-
time.sleep(30)
|
|
102
|
-
|
|
103
|
-
# SIGKILL if they're still running.
|
|
104
|
-
for child in children:
|
|
105
|
-
try:
|
|
106
|
-
child.kill()
|
|
107
|
-
except psutil.NoSuchProcess:
|
|
108
|
-
continue
|
|
181
|
+
if __name__ == '__main__':
|
|
182
|
+
main()
|
sky/skypilot_config.py
CHANGED
|
@@ -64,7 +64,6 @@ from sqlalchemy import orm
|
|
|
64
64
|
from sqlalchemy.dialects import postgresql
|
|
65
65
|
from sqlalchemy.dialects import sqlite
|
|
66
66
|
from sqlalchemy.ext import declarative
|
|
67
|
-
from sqlalchemy.pool import NullPool
|
|
68
67
|
|
|
69
68
|
from sky import exceptions
|
|
70
69
|
from sky import sky_logging
|
|
@@ -77,6 +76,7 @@ from sky.utils import schemas
|
|
|
77
76
|
from sky.utils import ux_utils
|
|
78
77
|
from sky.utils import yaml_utils
|
|
79
78
|
from sky.utils.db import db_utils
|
|
79
|
+
from sky.utils.db import migration_utils
|
|
80
80
|
from sky.utils.kubernetes import config_map_utils
|
|
81
81
|
|
|
82
82
|
if typing.TYPE_CHECKING:
|
|
@@ -121,7 +121,8 @@ _PROJECT_CONFIG_PATH = '.sky.yaml'
|
|
|
121
121
|
|
|
122
122
|
API_SERVER_CONFIG_KEY = 'api_server_config'
|
|
123
123
|
|
|
124
|
-
|
|
124
|
+
_SQLALCHEMY_ENGINE: Optional[sqlalchemy.engine.Engine] = None
|
|
125
|
+
_SQLALCHEMY_ENGINE_LOCK = threading.Lock()
|
|
125
126
|
|
|
126
127
|
Base = declarative.declarative_base()
|
|
127
128
|
|
|
@@ -481,7 +482,7 @@ def safe_reload_config() -> None:
|
|
|
481
482
|
reload_config()
|
|
482
483
|
|
|
483
484
|
|
|
484
|
-
def reload_config() -> None:
|
|
485
|
+
def reload_config(init_db: bool = False) -> None:
|
|
485
486
|
internal_config_path = os.environ.get(ENV_VAR_SKYPILOT_CONFIG)
|
|
486
487
|
if internal_config_path is not None:
|
|
487
488
|
# {ENV_VAR_SKYPILOT_CONFIG} is used internally.
|
|
@@ -493,7 +494,7 @@ def reload_config() -> None:
|
|
|
493
494
|
return
|
|
494
495
|
|
|
495
496
|
if os.environ.get(constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None:
|
|
496
|
-
_reload_config_as_server()
|
|
497
|
+
_reload_config_as_server(init_db=init_db)
|
|
497
498
|
else:
|
|
498
499
|
_reload_config_as_client()
|
|
499
500
|
|
|
@@ -564,7 +565,43 @@ def _reload_config_from_internal_file(internal_config_path: str) -> None:
|
|
|
564
565
|
_set_loaded_config_path(config_path)
|
|
565
566
|
|
|
566
567
|
|
|
567
|
-
def
|
|
568
|
+
def _create_table(engine: sqlalchemy.engine.Engine):
|
|
569
|
+
"""Initialize the config database with migrations."""
|
|
570
|
+
migration_utils.safe_alembic_upgrade(
|
|
571
|
+
engine, migration_utils.SKYPILOT_CONFIG_DB_NAME,
|
|
572
|
+
migration_utils.SKYPILOT_CONFIG_VERSION)
|
|
573
|
+
|
|
574
|
+
|
|
575
|
+
def _initialize_and_get_db() -> sqlalchemy.engine.Engine:
|
|
576
|
+
"""Initialize and return the config database engine.
|
|
577
|
+
|
|
578
|
+
This function should only be called by the API Server during initialization.
|
|
579
|
+
Client-side code should never call this function.
|
|
580
|
+
"""
|
|
581
|
+
assert os.environ.get(constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None, (
|
|
582
|
+
'initialize_and_get_db() can only be called by the API Server')
|
|
583
|
+
|
|
584
|
+
global _SQLALCHEMY_ENGINE
|
|
585
|
+
|
|
586
|
+
if _SQLALCHEMY_ENGINE is not None:
|
|
587
|
+
return _SQLALCHEMY_ENGINE
|
|
588
|
+
|
|
589
|
+
with _SQLALCHEMY_ENGINE_LOCK:
|
|
590
|
+
if _SQLALCHEMY_ENGINE is not None:
|
|
591
|
+
return _SQLALCHEMY_ENGINE
|
|
592
|
+
|
|
593
|
+
# We only store config in the DB when using Postgres,
|
|
594
|
+
# so no need to pass in db_name here.
|
|
595
|
+
engine = db_utils.get_engine(None)
|
|
596
|
+
|
|
597
|
+
# Run migrations if needed
|
|
598
|
+
_create_table(engine)
|
|
599
|
+
|
|
600
|
+
_SQLALCHEMY_ENGINE = engine
|
|
601
|
+
return _SQLALCHEMY_ENGINE
|
|
602
|
+
|
|
603
|
+
|
|
604
|
+
def _reload_config_as_server(init_db: bool = False) -> None:
|
|
568
605
|
# Reset the global variables, to avoid using stale values.
|
|
569
606
|
_set_loaded_config(config_utils.Config())
|
|
570
607
|
_set_loaded_config_path(None)
|
|
@@ -580,37 +617,24 @@ def _reload_config_as_server() -> None:
|
|
|
580
617
|
raise ValueError(
|
|
581
618
|
'If db config is specified, no other config is allowed')
|
|
582
619
|
logger.debug('retrieving config from database')
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
db_config = config_utils.Config(
|
|
602
|
-
yaml_utils.safe_load(row.value))
|
|
603
|
-
db_config.pop_nested(('db',), None)
|
|
604
|
-
return db_config
|
|
605
|
-
return None
|
|
606
|
-
|
|
607
|
-
db_config = _get_config_yaml_from_db(API_SERVER_CONFIG_KEY)
|
|
608
|
-
if db_config:
|
|
609
|
-
server_config = overlay_skypilot_config(server_config,
|
|
610
|
-
db_config)
|
|
611
|
-
# Close the engine to avoid connection leaks
|
|
612
|
-
if dispose_engine:
|
|
613
|
-
sqlalchemy_engine.dispose()
|
|
620
|
+
|
|
621
|
+
if init_db:
|
|
622
|
+
_initialize_and_get_db()
|
|
623
|
+
|
|
624
|
+
def _get_config_yaml_from_db(key: str) -> Optional[config_utils.Config]:
|
|
625
|
+
assert _SQLALCHEMY_ENGINE is not None
|
|
626
|
+
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
|
627
|
+
row = session.query(config_yaml_table).filter_by(
|
|
628
|
+
key=key).first()
|
|
629
|
+
if row:
|
|
630
|
+
db_config = config_utils.Config(yaml_utils.safe_load(row.value))
|
|
631
|
+
db_config.pop_nested(('db',), None)
|
|
632
|
+
return db_config
|
|
633
|
+
return None
|
|
634
|
+
|
|
635
|
+
db_config = _get_config_yaml_from_db(API_SERVER_CONFIG_KEY)
|
|
636
|
+
if db_config:
|
|
637
|
+
server_config = overlay_skypilot_config(server_config, db_config)
|
|
614
638
|
if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
|
|
615
639
|
logger.debug(f'server config: \n'
|
|
616
640
|
f'{yaml_utils.dump_yaml_str(dict(server_config))}')
|
|
@@ -666,7 +690,7 @@ def loaded_config_path_serialized() -> Optional[str]:
|
|
|
666
690
|
|
|
667
691
|
|
|
668
692
|
# Load on import, synchronization is guaranteed by python interpreter.
|
|
669
|
-
reload_config()
|
|
693
|
+
reload_config(init_db=True)
|
|
670
694
|
|
|
671
695
|
|
|
672
696
|
def loaded() -> bool:
|
|
@@ -880,44 +904,32 @@ def update_api_server_config_no_lock(config: config_utils.Config) -> None:
|
|
|
880
904
|
if new_db_url and new_db_url != existing_db_url:
|
|
881
905
|
raise ValueError('Cannot change db url while server is running')
|
|
882
906
|
if existing_db_url:
|
|
883
|
-
|
|
884
|
-
|
|
885
|
-
|
|
886
|
-
|
|
887
|
-
|
|
888
|
-
|
|
889
|
-
|
|
890
|
-
|
|
891
|
-
|
|
892
|
-
|
|
893
|
-
|
|
894
|
-
|
|
895
|
-
|
|
896
|
-
|
|
897
|
-
|
|
898
|
-
|
|
899
|
-
|
|
900
|
-
|
|
901
|
-
|
|
902
|
-
|
|
903
|
-
|
|
904
|
-
|
|
905
|
-
|
|
906
|
-
|
|
907
|
-
|
|
908
|
-
|
|
909
|
-
do_update_stmt = insert_stmnt.on_conflict_do_update(
|
|
910
|
-
index_elements=[config_yaml_table.c.key],
|
|
911
|
-
set_={config_yaml_table.c.value: config_str})
|
|
912
|
-
session.execute(do_update_stmt)
|
|
913
|
-
session.commit()
|
|
914
|
-
|
|
915
|
-
logger.debug('saving api_server config to db')
|
|
916
|
-
_set_config_yaml_to_db(API_SERVER_CONFIG_KEY, config)
|
|
917
|
-
db_updated = True
|
|
918
|
-
# Close the engine to avoid connection leaks
|
|
919
|
-
if dispose_engine:
|
|
920
|
-
sqlalchemy_engine.dispose()
|
|
907
|
+
|
|
908
|
+
def _set_config_yaml_to_db(key: str, config: config_utils.Config):
|
|
909
|
+
# reload_config(init_db=True) is called when this module is
|
|
910
|
+
# imported, so the database engine must already be initialized.
|
|
911
|
+
assert _SQLALCHEMY_ENGINE is not None
|
|
912
|
+
config_str = yaml_utils.dump_yaml_str(dict(config))
|
|
913
|
+
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
|
914
|
+
if (_SQLALCHEMY_ENGINE.dialect.name ==
|
|
915
|
+
db_utils.SQLAlchemyDialect.SQLITE.value):
|
|
916
|
+
insert_func = sqlite.insert
|
|
917
|
+
elif (_SQLALCHEMY_ENGINE.dialect.name ==
|
|
918
|
+
db_utils.SQLAlchemyDialect.POSTGRESQL.value):
|
|
919
|
+
insert_func = postgresql.insert
|
|
920
|
+
else:
|
|
921
|
+
raise ValueError('Unsupported database dialect')
|
|
922
|
+
insert_stmnt = insert_func(config_yaml_table).values(
|
|
923
|
+
key=key, value=config_str)
|
|
924
|
+
do_update_stmt = insert_stmnt.on_conflict_do_update(
|
|
925
|
+
index_elements=[config_yaml_table.c.key],
|
|
926
|
+
set_={config_yaml_table.c.value: config_str})
|
|
927
|
+
session.execute(do_update_stmt)
|
|
928
|
+
session.commit()
|
|
929
|
+
|
|
930
|
+
logger.debug('saving api_server config to db')
|
|
931
|
+
_set_config_yaml_to_db(API_SERVER_CONFIG_KEY, config)
|
|
932
|
+
db_updated = True
|
|
921
933
|
|
|
922
934
|
if not db_updated:
|
|
923
935
|
# save to the local file (PVC in Kubernetes, local file otherwise)
|
sky/ssh_node_pools/server.py
CHANGED
|
@@ -7,6 +7,7 @@ import fastapi
|
|
|
7
7
|
from sky import core as sky_core
|
|
8
8
|
from sky.server.requests import executor
|
|
9
9
|
from sky.server.requests import payloads
|
|
10
|
+
from sky.server.requests import request_names
|
|
10
11
|
from sky.server.requests import requests as requests_lib
|
|
11
12
|
from sky.ssh_node_pools import core as ssh_node_pools_core
|
|
12
13
|
from sky.utils import common_utils
|
|
@@ -99,9 +100,9 @@ async def deploy_ssh_node_pool(request: fastapi.Request,
|
|
|
99
100
|
"""Deploy SSH Node Pool using existing ssh_up functionality."""
|
|
100
101
|
try:
|
|
101
102
|
ssh_up_body = payloads.SSHUpBody(infra=pool_name, cleanup=False)
|
|
102
|
-
executor.
|
|
103
|
+
await executor.schedule_request_async(
|
|
103
104
|
request_id=request.state.request_id,
|
|
104
|
-
request_name=
|
|
105
|
+
request_name=request_names.RequestName.SSH_NODE_POOLS_UP,
|
|
105
106
|
request_body=ssh_up_body,
|
|
106
107
|
func=sky_core.ssh_up,
|
|
107
108
|
schedule_type=requests_lib.ScheduleType.LONG,
|
|
@@ -124,9 +125,9 @@ async def deploy_ssh_node_pool_general(
|
|
|
124
125
|
ssh_up_body: payloads.SSHUpBody) -> Dict[str, str]:
|
|
125
126
|
"""Deploys all SSH Node Pools."""
|
|
126
127
|
try:
|
|
127
|
-
executor.
|
|
128
|
+
await executor.schedule_request_async(
|
|
128
129
|
request_id=request.state.request_id,
|
|
129
|
-
request_name=
|
|
130
|
+
request_name=request_names.RequestName.SSH_NODE_POOLS_UP,
|
|
130
131
|
request_body=ssh_up_body,
|
|
131
132
|
func=sky_core.ssh_up,
|
|
132
133
|
schedule_type=requests_lib.ScheduleType.LONG,
|
|
@@ -150,9 +151,9 @@ async def down_ssh_node_pool(request: fastapi.Request,
|
|
|
150
151
|
"""Cleans up a SSH Node Pools."""
|
|
151
152
|
try:
|
|
152
153
|
ssh_up_body = payloads.SSHUpBody(infra=pool_name, cleanup=True)
|
|
153
|
-
executor.
|
|
154
|
+
await executor.schedule_request_async(
|
|
154
155
|
request_id=request.state.request_id,
|
|
155
|
-
request_name=
|
|
156
|
+
request_name=request_names.RequestName.SSH_NODE_POOLS_DOWN,
|
|
156
157
|
request_body=ssh_up_body,
|
|
157
158
|
func=sky_core.ssh_up, # Reuse ssh_up function with cleanup=True
|
|
158
159
|
schedule_type=requests_lib.ScheduleType.LONG,
|
|
@@ -178,9 +179,9 @@ async def down_ssh_node_pool_general(
|
|
|
178
179
|
try:
|
|
179
180
|
# Set cleanup=True for down operation
|
|
180
181
|
ssh_up_body.cleanup = True
|
|
181
|
-
executor.
|
|
182
|
+
await executor.schedule_request_async(
|
|
182
183
|
request_id=request.state.request_id,
|
|
183
|
-
request_name=
|
|
184
|
+
request_name=request_names.RequestName.SSH_NODE_POOLS_DOWN,
|
|
184
185
|
request_body=ssh_up_body,
|
|
185
186
|
func=sky_core.ssh_up, # Reuse ssh_up function with cleanup=True
|
|
186
187
|
schedule_type=requests_lib.ScheduleType.LONG,
|