skypilot-nightly 1.0.0.dev20251009__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of skypilot-nightly might be problematic.
- sky/__init__.py +6 -2
- sky/adaptors/aws.py +25 -7
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +59 -149
- sky/backends/backend_utils.py +104 -63
- sky/backends/cloud_vm_ray_backend.py +84 -39
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +24 -28
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/shadeform_catalog.py +165 -0
- sky/check.py +25 -13
- sky/client/cli/command.py +335 -86
- sky/client/cli/flags.py +4 -2
- sky/client/cli/table_utils.py +17 -9
- sky/client/sdk.py +59 -12
- sky/cloud_stores.py +73 -0
- sky/clouds/__init__.py +2 -0
- sky/clouds/aws.py +71 -16
- sky/clouds/azure.py +12 -5
- sky/clouds/cloud.py +19 -9
- sky/clouds/cudo.py +12 -5
- sky/clouds/do.py +4 -1
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +12 -5
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +62 -25
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +12 -5
- sky/clouds/oci.py +12 -5
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +4 -1
- sky/clouds/runpod.py +12 -5
- sky/clouds/scp.py +12 -5
- sky/clouds/seeweb.py +4 -1
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +4 -2
- sky/clouds/vast.py +12 -5
- sky/clouds/vsphere.py +4 -1
- sky/core.py +12 -11
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/{1871-49141c317f3a9020.js → 1871-74503c8e80fd253b.js} +1 -1
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
- sky/dashboard/out/_next/static/chunks/{3785.a19328ba41517b8b.js → 3785.ad6adaa2a0fa9768.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{4725.10f7a9a5d3ea8208.js → 4725.a830b5c9e7867c92.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
- sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-477555ab7c0b13d8.js → [cluster]-a37d2063af475a1c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{clusters-2f61f65487f6d8ff.js → clusters-d44859594e6f8064.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-553b8b5cb65e100b.js → [context]-c0b5935149902e6f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-910a22500c50596f.js → infra-aed0ea19df7cf961.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-bc979970c247d8f3.js → [pool]-6edeb7d06032adfc.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/{jobs-a35a9dc3c5ccd657.js → jobs-479dde13399cf270.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{users-98d2ed979084162a.js → users-5ab3b907622cf0fe.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{volumes-835d14ba94808f79.js → volumes-b84b948ff357c43e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-e8688c35c06f0ac5.js → [name]-c5a3eeee1c218af1.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-69c80d677d3c2949.js → workspaces-22b23febb3e89ce1.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +143 -19
- sky/data/storage.py +168 -11
- sky/exceptions.py +13 -1
- sky/execution.py +13 -0
- sky/global_user_state.py +189 -113
- sky/jobs/client/sdk.py +32 -10
- sky/jobs/client/sdk_async.py +9 -3
- sky/jobs/constants.py +3 -1
- sky/jobs/controller.py +164 -192
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/recovery_strategy.py +59 -82
- sky/jobs/scheduler.py +20 -9
- sky/jobs/server/core.py +105 -23
- sky/jobs/server/server.py +40 -28
- sky/jobs/server/utils.py +32 -11
- sky/jobs/state.py +588 -110
- sky/jobs/utils.py +442 -209
- sky/logs/agent.py +1 -1
- sky/metrics/utils.py +45 -6
- sky/optimizer.py +1 -1
- sky/provision/__init__.py +7 -0
- sky/provision/aws/instance.py +2 -1
- sky/provision/azure/instance.py +2 -1
- sky/provision/common.py +2 -0
- sky/provision/cudo/instance.py +2 -1
- sky/provision/do/instance.py +2 -1
- sky/provision/fluidstack/instance.py +4 -3
- sky/provision/gcp/instance.py +2 -1
- sky/provision/hyperbolic/instance.py +2 -1
- sky/provision/instance_setup.py +10 -2
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +222 -89
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/utils.py +114 -53
- sky/provision/kubernetes/volume.py +5 -4
- sky/provision/lambda_cloud/instance.py +2 -1
- sky/provision/nebius/instance.py +2 -1
- sky/provision/oci/instance.py +2 -1
- sky/provision/paperspace/instance.py +2 -1
- sky/provision/provisioner.py +11 -2
- sky/provision/runpod/instance.py +2 -1
- sky/provision/scp/instance.py +2 -1
- sky/provision/seeweb/instance.py +3 -3
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/vast/instance.py +2 -1
- sky/provision/vsphere/instance.py +2 -1
- sky/resources.py +1 -1
- sky/schemas/api/responses.py +9 -5
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/jobsv1_pb2.py +52 -52
- sky/schemas/generated/jobsv1_pb2.pyi +4 -2
- sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
- sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
- sky/serve/client/impl.py +11 -3
- sky/serve/replica_managers.py +5 -2
- sky/serve/serve_utils.py +9 -2
- sky/serve/server/impl.py +7 -2
- sky/serve/server/server.py +18 -15
- sky/serve/service.py +2 -2
- sky/server/auth/oauth2_proxy.py +2 -5
- sky/server/common.py +31 -28
- sky/server/constants.py +5 -1
- sky/server/daemons.py +27 -19
- sky/server/requests/executor.py +138 -74
- sky/server/requests/payloads.py +9 -1
- sky/server/requests/preconditions.py +13 -10
- sky/server/requests/request_names.py +120 -0
- sky/server/requests/requests.py +485 -153
- sky/server/requests/serializers/decoders.py +26 -13
- sky/server/requests/serializers/encoders.py +56 -11
- sky/server/requests/threads.py +106 -0
- sky/server/rest.py +70 -18
- sky/server/server.py +283 -104
- sky/server/stream_utils.py +233 -59
- sky/server/uvicorn.py +18 -17
- sky/setup_files/alembic.ini +4 -0
- sky/setup_files/dependencies.py +32 -13
- sky/sky_logging.py +0 -2
- sky/skylet/constants.py +30 -7
- sky/skylet/events.py +7 -0
- sky/skylet/log_lib.py +8 -2
- sky/skylet/log_lib.pyi +1 -1
- sky/skylet/services.py +26 -13
- sky/skylet/subprocess_daemon.py +103 -29
- sky/skypilot_config.py +87 -75
- sky/ssh_node_pools/server.py +9 -8
- sky/task.py +67 -54
- sky/templates/kubernetes-ray.yml.j2 +8 -1
- sky/templates/nebius-ray.yml.j2 +1 -0
- sky/templates/shadeform-ray.yml.j2 +72 -0
- sky/templates/websocket_proxy.py +142 -12
- sky/users/permission.py +8 -1
- sky/utils/admin_policy_utils.py +16 -3
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +8 -2
- sky/utils/command_runner.py +11 -0
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +7 -4
- sky/utils/context.py +57 -51
- sky/utils/context_utils.py +30 -12
- sky/utils/controller_utils.py +35 -8
- sky/utils/db/db_utils.py +37 -10
- sky/utils/db/migration_utils.py +8 -4
- sky/utils/locks.py +24 -6
- sky/utils/resource_checker.py +4 -1
- sky/utils/resources_utils.py +53 -29
- sky/utils/schemas.py +23 -4
- sky/utils/subprocess_utils.py +17 -4
- sky/volumes/server/server.py +7 -6
- sky/workspaces/server.py +13 -12
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/METADATA +306 -55
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/RECORD +215 -195
- sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +0 -11
- sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +0 -26
- sky/dashboard/out/_next/static/chunks/3015-8d748834fcc60b46.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.1fafbf42b3bcebff.js +0 -1
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-f6818c84ed8f1c86.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9360.71e83b2ddc844ec2.js +0 -31
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8f058b0346db2aff.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-4f7079dcab6ed653.js +0 -16
- sky/dashboard/out/_next/static/chunks/webpack-6a5ddd0184bfa22c.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/hIViZcQBkn0HE8SpaSsUU/_buildManifest.js +0 -1
- /sky/dashboard/out/_next/static/{hIViZcQBkn0HE8SpaSsUU → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/task.py CHANGED

@@ -1,6 +1,5 @@
 """Task: a coarse-grained stage in an application."""
 import collections
-import inspect
 import json
 import os
 import re
@@ -29,10 +28,6 @@ from sky.utils import yaml_utils
 
 logger = sky_logging.init_logger(__name__)
 
-# A lambda generating commands (node rank_i, node addrs -> cmd_i).
-CommandGen = Callable[[int, List[str]], Optional[str]]
-CommandOrCommandGen = Union[str, CommandGen]
-
 _VALID_NAME_REGEX = '[a-zA-Z0-9]+(?:[._-]{1,2}[a-zA-Z0-9]+)*'
 _VALID_NAME_DESCR = ('ASCII characters and may contain lowercase and'
                      ' uppercase letters, digits, underscores, periods,'
@@ -236,7 +231,7 @@ class Task:
         name: Optional[str] = None,
         *,
         setup: Optional[Union[str, List[str]]] = None,
-        run: Optional[Union[
+        run: Optional[Union[str, List[str]]] = None,
         envs: Optional[Dict[str, str]] = None,
         secrets: Optional[Dict[str, str]] = None,
         workdir: Optional[Union[str, Dict[str, Any]]] = None,
@@ -349,7 +344,7 @@ class Task:
         self._volumes = volumes or {}
 
         # concatenate commands if given as list
-        def _concat(commands):
+        def _concat(commands: Optional[Union[str, List[str]]]) -> Optional[str]:
             if isinstance(commands, list):
                 return '\n'.join(commands)
             return commands
@@ -447,42 +442,9 @@
 
     def validate_run(self):
         """Validates if the run command is valid."""
-        if
-            run_sig = inspect.signature(self.run)
-            # Check that run is a function with 2 arguments.
-            if len(run_sig.parameters) != 2:
-                with ux_utils.print_exception_no_traceback():
-                    raise ValueError(_RUN_FN_CHECK_FAIL_MSG.format(run_sig))
-
-            type_list = [int, List[str]]
-            # Check annotations, if exists
-            for i, param in enumerate(run_sig.parameters.values()):
-                if param.annotation != inspect.Parameter.empty:
-                    if param.annotation != type_list[i]:
-                        with ux_utils.print_exception_no_traceback():
-                            raise ValueError(
-                                _RUN_FN_CHECK_FAIL_MSG.format(run_sig))
-
-            # Check self containedness.
-            run_closure = inspect.getclosurevars(self.run)
-            if run_closure.nonlocals:
-                with ux_utils.print_exception_no_traceback():
-                    raise ValueError(
-                        'run command generator must be self contained. '
-                        f'Found nonlocals: {run_closure.nonlocals}')
-            if run_closure.globals:
-                with ux_utils.print_exception_no_traceback():
-                    raise ValueError(
-                        'run command generator must be self contained. '
-                        f'Found globals: {run_closure.globals}')
-            if run_closure.unbound:
-                # Do not raise an error here. Import statements, which are
-                # allowed, will be considered as unbounded.
-                pass
-        elif self.run is not None and not isinstance(self.run, str):
+        if self.run is not None and not isinstance(self.run, str):
             with ux_utils.print_exception_no_traceback():
-                raise ValueError('run must be
-                                 f'a command generator ({CommandGen}). '
+                raise ValueError('run must be a shell script (str). '
                                  f'Got {type(self.run)}')
 
     def expand_and_validate_file_mounts(self):
@@ -1130,7 +1092,7 @@ class Task:
     def set_resources(
         self, resources: Union['resources_lib.Resources',
                                List['resources_lib.Resources'],
-                               Set['resources_lib.Resources']]
+                               Set['resources_lib.Resources'], Dict[str, Any]]
     ) -> 'Task':
         """Sets the required resources to execute this task.
 
@@ -1144,7 +1106,9 @@
         Returns:
             self: The current task, with resources set.
         """
-        if isinstance(resources,
+        if isinstance(resources, dict):
+            resources = resources_lib.Resources.from_yaml_config(resources)
+        elif isinstance(resources, resources_lib.Resources):
             resources = {resources}
         # TODO(woosuk): Check if the resources are None.
         self.resources = _with_docker_login_config(resources, self.envs,
@@ -1172,6 +1136,10 @@
         self.set_resources(type(self.resources)(new_resources_list))
         return self
 
+    def get_resource_config(self) -> Dict[str, Any]:
+        return _resources_to_config(self.resources,
+                                    factor_out_common_fields=True)
+
     @property
     def service(self) -> Optional[service_spec.SkyServiceSpec]:
         return self._service
@@ -1552,6 +1520,16 @@
                 self.update_file_mounts({
                     mnt_path: blob_path,
                 })
+            elif store_type is storage_lib.StoreType.COREWEAVE:
+                if storage.source is not None and not isinstance(
+                        storage.source,
+                        list) and storage.source.startswith('cw://'):
+                    blob_path = storage.source
+                else:
+                    blob_path = 'cw://' + storage.name
+                self.update_file_mounts({
+                    mnt_path: blob_path,
+                })
             else:
                 with ux_utils.print_exception_no_traceback():
                     raise ValueError(f'Storage Type {store_type} '
@@ -1688,16 +1666,7 @@
 
         add_if_not_none('name', self.name)
 
-        tmp_resource_config
-            Dict[str, List[Dict[str, Union[str, int]]]]]
-        if len(self.resources) > 1:
-            resource_list = []
-            for r in self.resources:
-                resource_list.append(r.to_yaml_config())
-            key = 'ordered' if isinstance(self.resources, list) else 'any_of'
-            tmp_resource_config = {key: resource_list}
-        else:
-            tmp_resource_config = list(self.resources)[0].to_yaml_config()
+        tmp_resource_config = _resources_to_config(self.resources)
 
         add_if_not_none('resources', tmp_resource_config)
 
@@ -1810,3 +1779,47 @@
         else:
             s += '\n resources: default instances'
         return s
+
+
+def _resources_to_config(
+    resources: Union[List['resources_lib.Resources'],
+                     Set['resources_lib.Resources']],
+    factor_out_common_fields: bool = False) -> Dict[str, Any]:
+    if len(resources) > 1:
+        resource_list: List[Dict[str, Union[str, int]]] = []
+        for r in resources:
+            resource_list.append(r.to_yaml_config())
+        group_key = 'ordered' if isinstance(resources, list) else 'any_of'
+        if factor_out_common_fields:
+            return _factor_out_common_resource_fields(resource_list, group_key)
+        return {group_key: resource_list}
+    else:
+        return list(resources)[0].to_yaml_config()
+
+
+def _factor_out_common_resource_fields(configs: List[Dict[str, Union[str,
+                                                                     int]]],
+                                       group_key: str) -> Dict[str, Any]:
+    """Factors out the fields that are common to all resources."""
+    return_config: Dict[str, Any] = configs[0].copy()
+    if len(configs) > 1:
+        for config in configs[1:]:
+            for key, value in config.items():
+                if key in return_config and return_config[key] != value:
+                    del return_config[key]
+    num_empty_configs = 0
+    for config in configs:
+        keys_to_delete = []
+        for key, value in config.items():
+            if key in return_config:
+                keys_to_delete.append(key)
+        for key in keys_to_delete:
+            del config[key]
+        if not config:
+            num_empty_configs += 1
+
+    if num_empty_configs == len(configs):
+        return return_config
+    if len(configs) > 0:
+        return_config[group_key] = configs
+    return return_config
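
The new `_resources_to_config`/`_factor_out_common_resource_fields` pair hoists fields shared by every candidate resource to the top level of the generated config, leaving only the differing fields under the `any_of`/`ordered` group key. A minimal standalone sketch of that factoring on plain dicts (the field names in the sample input are illustrative, not taken from this release):

from typing import Any, Dict, List


def factor_out_common_fields(configs: List[Dict[str, Any]],
                             group_key: str) -> Dict[str, Any]:
    # Start from the first config and drop any key whose value differs in a
    # later config; whatever survives is common to all of them.
    common: Dict[str, Any] = configs[0].copy()
    for config in configs[1:]:
        for key, value in config.items():
            if key in common and common[key] != value:
                del common[key]
    # Strip the common keys out of each per-resource config.
    for config in configs:
        for key in [k for k in config if k in common]:
            del config[key]
    if all(not config for config in configs):
        # Every config collapsed entirely into the common fields.
        return common
    common[group_key] = configs
    return common


# Two GPU choices sharing cpus/disk: the shared fields are hoisted and only
# the differing accelerators remain under 'any_of'.
configs = [
    {'cpus': 8, 'disk_size': 256, 'accelerators': 'A100:1'},
    {'cpus': 8, 'disk_size': 256, 'accelerators': 'L4:1'},
]
print(factor_out_common_fields(configs, 'any_of'))
# {'cpus': 8, 'disk_size': 256,
#  'any_of': [{'accelerators': 'A100:1'}, {'accelerators': 'L4:1'}]}
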
sky/templates/kubernetes-ray.yml.j2 CHANGED

@@ -209,7 +209,9 @@ provider:
     metadata:
       labels:
         parent: skypilot
+        # TODO (kyuds): remove this label for v0.12.0, as skypilot-cluster label is deprecated in favor of skypilot-cluster-name.
         skypilot-cluster: {{cluster_name_on_cloud}}
+        skypilot-cluster-name: {{cluster_name_on_cloud}}
         skypilot-user: {{ user }}
       name: {{cluster_name_on_cloud}}-head-ssh
     spec:
@@ -227,7 +229,9 @@ provider:
     metadata:
       labels:
         parent: skypilot
+        # TODO (kyuds): remove this label for v0.12.0, as skypilot-cluster label is deprecated in favor of skypilot-cluster-name.
         skypilot-cluster: {{cluster_name_on_cloud}}
+        skypilot-cluster-name: {{cluster_name_on_cloud}}
         skypilot-user: {{ user }}
       # NOTE: If you're running multiple Ray clusters with services
       # on one Kubernetes cluster, they must have unique service
@@ -247,7 +251,9 @@ provider:
     metadata:
       labels:
         parent: skypilot
+        # TODO (kyuds): remove this label for v0.12.0, as skypilot-cluster label is deprecated in favor of skypilot-cluster-name.
         skypilot-cluster: {{cluster_name_on_cloud}}
+        skypilot-cluster-name: {{cluster_name_on_cloud}}
         skypilot-user: {{ user }}
       name: {{cluster_name_on_cloud}}-worker{{ worker_id }}
     spec:
@@ -272,6 +278,7 @@ available_node_types:
       labels:
         parent: skypilot
         # component will be set for the head node pod to be the same as the head node service selector above if a
+        # TODO (kyuds): remove this label for v0.11.0, as skypilot-cluster label is deprecated in favor of skypilot-cluster-name.
         skypilot-cluster: {{cluster_name_on_cloud}}
         skypilot-user: {{ user }}
         # Custom tags for the pods
@@ -1059,7 +1066,7 @@ available_node_types:
   # Also, skip the jobs that are waiting to be scheduled as those does not have a controller process running.
   # For SkyServe, this will be None and every service will be recovered. This is because SkyServe
   # will delete the service from the database after it is terminated so everything in the database is running.
-  ALL_IN_PROGRESS_JOBS=$({{sky_python_cmd}} -c "from sky.jobs import state; jobs = state.
+  ALL_IN_PROGRESS_JOBS=$({{sky_python_cmd}} -c "from sky.jobs import state; jobs, _ = state.get_managed_jobs_with_filters(fields=['job_id', 'schedule_state']); print(' '.join({str(job['job_id']) for job in jobs if job['schedule_state'] not in [state.ManagedJobScheduleState.DONE, state.ManagedJobScheduleState.WAITING]}) if jobs else None)")
   if [ "$ALL_IN_PROGRESS_JOBS" != "None" ]; then
     read -ra ALL_IN_PROGRESS_JOBS_SEQ <<< "$ALL_IN_PROGRESS_JOBS"
   fi
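
During the deprecation window both labels are applied, so selectors written against either key keep matching the same pods. A hypothetical lookup with the official Kubernetes Python client using the new key (the namespace and cluster name are placeholders):

# Hypothetical: list a SkyPilot cluster's pods via the new label key.
# Requires `pip install kubernetes` and a valid kubeconfig.
from kubernetes import client, config

config.load_kube_config()
v1 = client.CoreV1Api()
pods = v1.list_namespaced_pod(
    'default',  # placeholder namespace
    label_selector='skypilot-cluster-name=my-cluster')  # placeholder name
for pod in pods.items:
    # The old label is still present during the deprecation window.
    print(pod.metadata.name, pod.metadata.labels.get('skypilot-cluster'))
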
sky/templates/nebius-ray.yml.j2 CHANGED

@@ -156,6 +156,7 @@ setup_commands:
     echo '{{env_var}}={{env_value}}' | sudo tee -a /etc/environment;
   {%- endfor %}
   {%- endif %}
+  IP=$(hostname -I | awk '{print $1}'); echo "$IP $(hostname)" | sudo tee -a /etc/hosts;
   sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
   sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
   mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
sky/templates/shadeform-ray.yml.j2 ADDED

@@ -0,0 +1,72 @@
+cluster_name: {{cluster_name_on_cloud}}
+
+# The maximum number of workers nodes to launch in addition to the head node.
+max_workers: {{num_nodes - 1}}
+upscaling_speed: {{num_nodes - 1}}
+idle_timeout_minutes: 60
+
+provider:
+  type: external
+  module: sky.provision.shadeform
+  region: "{{region}}"
+  disable_launch_config_check: true
+
+auth:
+  ssh_user: shadeform
+  ssh_private_key: {{ssh_private_key}}
+  ssh_key_id: {{ssh_key_id}}
+
+available_node_types:
+  ray_head_default:
+    {%- if custom_resources %}
+    resources: {{custom_resources}}
+    {%- else %}
+    resources: {}
+    {%- endif %}
+    node_config:
+      InstanceType: {{instance_type}}
+      PublicKey: |-
+        skypilot:ssh_public_key_content
+
+head_node_type: ray_head_default
+
+# Format: `REMOTE_PATH : LOCAL_PATH`
+file_mounts: {
+  "{{sky_ray_yaml_remote_path}}": "{{sky_ray_yaml_local_path}}",
+  "{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
+{%- for remote_path, local_path in credentials.items() %}
+  "{{remote_path}}": "{{local_path}}",
+{%- endfor %}
+}
+
+rsync_exclude: []
+
+initialization_commands: []
+
+# List of shell commands to run to set up nodes.
+# NOTE: these are very performance-sensitive. Each new item opens/closes an SSH
+# connection, which is expensive. Try your best to co-locate commands into fewer
+# items!
+#
+# Increment the following for catching performance bugs easier:
+# current num items (num SSH connections): 1
+setup_commands:
+  # Create ~/.ssh/config file in case the file does not exist in the image.
+  # Line 'rm ..': there is another installation of pip.
+  # Line 'sudo bash ..': set the ulimit as suggested by ray docs for performance. https://docs.ray.io/en/latest/cluster/vms/user-guides/large-cluster-best-practices.html#system-configuration
+  # Line 'sudo grep ..': set the number of threads per process to unlimited to avoid ray job submit stucking issue when the number of running ray jobs increase.
+  # Line 'mkdir -p ..': disable host key check
+  # Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
+  - {%- for initial_setup_command in initial_setup_commands %}
+    {{ initial_setup_command }}
+    {%- endfor %}
+    mkdir -p ~/.ssh; touch ~/.ssh/config; which patch > /dev/null || sudo apt install -y patch;
+    {{ conda_installation_commands }}
+    {{ ray_skypilot_installation_commands }}
+    sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
+    sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
+    (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;
+    {{ ssh_max_sessions_config }}
+
+# Command to start ray clusters are now placed in `sky.provision.instance_setup`.
+# We do not need to list it here anymore.
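
A quick way to sanity-check a provisioner template like the new shadeform-ray.yml.j2 is to render it locally with Jinja2. A sketch with made-up placeholder values (the path assumes a source checkout, and the real backend supplies these variables, and more, at launch time):

# Sketch: render the new template with placeholder values to check it parses.
# All values below are made up for illustration.
import jinja2

with open('sky/templates/shadeform-ray.yml.j2') as f:
    template = jinja2.Template(f.read())

rendered = template.render(
    cluster_name_on_cloud='sky-test-1',
    num_nodes=2,
    region='us-east-1',
    instance_type='A100_80GB_x1',
    ssh_private_key='~/.ssh/sky-key',
    ssh_key_id='key-123',
    custom_resources=None,
    sky_ray_yaml_remote_path='~/.sky/ray.yml',
    sky_ray_yaml_local_path='/tmp/ray.yml',
    sky_remote_path='~/.sky/wheels',
    sky_wheel_hash='abc123',
    sky_local_path='/tmp/wheel',
    credentials={},
    initial_setup_commands=[],
    conda_installation_commands='true;',
    ray_skypilot_installation_commands='true;',
    ssh_max_sessions_config='true;')
print(rendered[:400])  # eyeball the rendered YAML head
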
sky/templates/websocket_proxy.py CHANGED

@@ -11,15 +11,23 @@ This script is useful for users who do not have local Kubernetes credentials.
 import asyncio
 from http.cookiejar import MozillaCookieJar
 import os
+import struct
 import sys
-
+import time
+from typing import Dict, Optional
 from urllib.request import Request
 
+import requests
 import websockets
 from websockets.asyncio.client import ClientConnection
 from websockets.asyncio.client import connect
 
+from sky.server import constants
+from sky.server.server import KubernetesSSHMessageType
+from sky.skylet import constants as skylet_constants
+
 BUFFER_SIZE = 2**16  # 64KB
+HEARTBEAT_INTERVAL_SECONDS = 10
 
 # Environment variable for a file path to the API cookie file.
 # Keep in sync with server/constants.py
@@ -28,6 +36,8 @@ API_COOKIE_FILE_ENV_VAR = 'SKYPILOT_API_COOKIE_FILE'
 # Keep in sync with server/constants.py
 API_COOKIE_FILE_DEFAULT_LOCATION = '~/.sky/cookies.txt'
 
+MAX_UNANSWERED_PINGS = 100
+
 
 def _get_cookie_header(url: str) -> Dict[str, str]:
     """Extract Cookie header value from a cookie jar for a specific URL"""
@@ -49,7 +59,7 @@ def _get_cookie_header(url: str) -> Dict[str, str]:
     return {'Cookie': cookie_header}
 
 
-async def main(url: str) -> None:
+async def main(url: str, timestamps_supported: bool) -> None:
     cookie_header = _get_cookie_header(url)
     async with connect(url,
                        ping_interval=None,
@@ -75,45 +85,149 @@ async def main(url: str) -> None:
             asyncio.streams.FlowControlMixin, sys.stdout)  # type: ignore
         stdout_writer = asyncio.StreamWriter(transport, protocol, None,
                                              loop)
+        # Dictionary to store last ping time for latency measurement
+        last_ping_time_dict: Optional[Dict[int, float]] = None
+        if timestamps_supported:
+            last_ping_time_dict = {}
+
+        # Use an Event to signal when websocket is closed
+        websocket_closed_event = asyncio.Event()
+        websocket_lock = asyncio.Lock()
 
-        await asyncio.gather(
-
+        await asyncio.gather(
+            stdin_to_websocket(stdin_reader, websocket,
+                               timestamps_supported, websocket_closed_event,
+                               websocket_lock),
+            websocket_to_stdout(websocket, stdout_writer,
+                                timestamps_supported, last_ping_time_dict,
+                                websocket_closed_event, websocket_lock),
+            latency_monitor(websocket, last_ping_time_dict,
+                            websocket_closed_event, websocket_lock),
+            return_exceptions=True)
     finally:
         if old_settings:
            termios.tcsetattr(sys.stdin.fileno(), termios.TCSADRAIN,
                              old_settings)
 
 
+async def latency_monitor(websocket: ClientConnection,
+                          last_ping_time_dict: Optional[dict],
+                          websocket_closed_event: asyncio.Event,
+                          websocket_lock: asyncio.Lock):
+    """Periodically send PING messages (type 1) to measure latency."""
+    if last_ping_time_dict is None:
+        return
+    next_id = 0
+    while not websocket_closed_event.is_set():
+        try:
+            await asyncio.sleep(HEARTBEAT_INTERVAL_SECONDS)
+            if len(last_ping_time_dict) >= MAX_UNANSWERED_PINGS:
+                # We are not getting responses, clear the dictionary so
+                # as not to grow unbounded.
+                last_ping_time_dict.clear()
+            ping_time = time.time()
+            next_id += 1
+            last_ping_time_dict[next_id] = ping_time
+            message_header_bytes = struct.pack(
+                '!BI', KubernetesSSHMessageType.PINGPONG.value, next_id)
+            try:
+                async with websocket_lock:
+                    await websocket.send(message_header_bytes)
+            except websockets.exceptions.ConnectionClosed as e:
+                # Websocket is already closed.
+                print(f'Failed to send PING message: {e}', file=sys.stderr)
+                break
+        except Exception as e:
+            print(f'Error in latency_monitor: {e}', file=sys.stderr)
+            websocket_closed_event.set()
+            raise e
+
+
 async def stdin_to_websocket(reader: asyncio.StreamReader,
-                             websocket: ClientConnection
+                             websocket: ClientConnection,
+                             timestamps_supported: bool,
+                             websocket_closed_event: asyncio.Event,
+                             websocket_lock: asyncio.Lock):
     try:
-        while
+        while not websocket_closed_event.is_set():
             # Read at most BUFFER_SIZE bytes, this not affect
             # responsiveness since it will return as soon as
             # there is at least one byte.
             # The BUFFER_SIZE is chosen to be large enough to improve
             # throughput.
             data = await reader.read(BUFFER_SIZE)
+
             if not data:
                 break
-
+            if timestamps_supported:
+                # Send message with type 0 to indicate data.
+                message_type_bytes = struct.pack(
+                    '!B', KubernetesSSHMessageType.REGULAR_DATA.value)
+                data = message_type_bytes + data
+            async with websocket_lock:
+                await websocket.send(data)
+
     except Exception as e:  # pylint: disable=broad-except
         print(f'Error in stdin_to_websocket: {e}', file=sys.stderr)
     finally:
-
+        async with websocket_lock:
+            await websocket.close()
+        websocket_closed_event.set()
 
 
 async def websocket_to_stdout(websocket: ClientConnection,
-                              writer: asyncio.StreamWriter
+                              writer: asyncio.StreamWriter,
+                              timestamps_supported: bool,
+                              last_ping_time_dict: Optional[dict],
+                              websocket_closed_event: asyncio.Event,
+                              websocket_lock: asyncio.Lock):
     try:
-        while
+        while not websocket_closed_event.is_set():
             message = await websocket.recv()
+            if (timestamps_supported and len(message) > 0 and
+                    last_ping_time_dict is not None):
+                message_type = struct.unpack('!B', message[:1])[0]
+                if message_type == KubernetesSSHMessageType.REGULAR_DATA.value:
+                    # Regular data - strip type byte and write to stdout
+                    message = message[1:]
+                elif message_type == KubernetesSSHMessageType.PINGPONG.value:
+                    # PONG response - calculate latency and send measurement
+                    if not len(message) == struct.calcsize('!BI'):
+                        raise ValueError(
+                            f'Invalid PONG message length: {len(message)}')
+                    pong_id = struct.unpack('!I', message[1:5])[0]
+                    pong_time = time.time()
+
+                    ping_time = last_ping_time_dict.pop(pong_id, None)
+
+                    if ping_time is None:
+                        continue
+
+                    latency_seconds = pong_time - ping_time
+                    latency_ms = int(latency_seconds * 1000)
+
+                    # Send latency measurement (type 2)
+                    message_type_bytes = struct.pack(
+                        '!B',
+                        KubernetesSSHMessageType.LATENCY_MEASUREMENT.value)
+                    latency_bytes = struct.pack('!Q', latency_ms)
+                    message = message_type_bytes + latency_bytes
+                    # Send to server.
+                    async with websocket_lock:
+                        await websocket.send(message)
+                    continue
+            # No timestamps support, write directly
             writer.write(message)
             await writer.drain()
     except websockets.exceptions.ConnectionClosed:
         print('WebSocket connection closed', file=sys.stderr)
     except Exception as e:  # pylint: disable=broad-except
         print(f'Error in websocket_to_stdout: {e}', file=sys.stderr)
+        raise e
+    finally:
+        async with websocket_lock:
+            await websocket.close()
+        websocket_closed_event.set()
 
 
 if __name__ == '__main__':
@@ -123,11 +237,27 @@
         # TODO(aylei): Remove this after 0.10.0
         server_url = f'http://{server_url}'
 
+    disable_latency_measurement = os.environ.get(
+        skylet_constants.SSH_DISABLE_LATENCY_MEASUREMENT_ENV_VAR, '0') == '1'
+    if disable_latency_measurement:
+        timestamps_are_supported = False
+    else:
+        health_url = f'{server_url}/api/health'
+        cookie_hdr = _get_cookie_header(health_url)
+        health_response = requests.get(health_url, headers=cookie_hdr)
+        health_data = health_response.json()
+        timestamps_are_supported = int(health_data.get('api_version', 0)) > 21
+
     server_proto, server_fqdn = server_url.split('://')
     websocket_proto = 'ws'
     if server_proto == 'https':
         websocket_proto = 'wss'
     server_url = f'{websocket_proto}://{server_fqdn}'
+
+    client_version_str = (f'&client_version={constants.API_VERSION}'
+                          if timestamps_are_supported else '')
+
     websocket_url = (f'{server_url}/kubernetes-pod-ssh-proxy'
-                     f'?cluster_name={sys.argv[2]}'
-
+                     f'?cluster_name={sys.argv[2]}'
+                     f'{client_version_str}')
+    asyncio.run(main(websocket_url, timestamps_are_supported))
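
The control framing added to the proxy is compact: a 1-byte message type, a 4-byte big-endian ping id for PING/PONG, and an 8-byte big-endian millisecond count for latency reports. A standalone sketch of the pack/unpack round trip; the numeric enum values (0 = data, 1 = ping/pong, 2 = latency) are inferred from the diff and should be treated as assumptions:

# Standalone sketch of the framing used by the proxy's latency protocol.
import enum
import struct


class MessageType(enum.Enum):
    REGULAR_DATA = 0
    PINGPONG = 1
    LATENCY_MEASUREMENT = 2


def pack_ping(ping_id: int) -> bytes:
    # '!BI' = network byte order: 1-byte type + 4-byte unsigned ping id.
    return struct.pack('!BI', MessageType.PINGPONG.value, ping_id)


def pack_latency(latency_ms: int) -> bytes:
    # 1-byte type followed by an 8-byte unsigned millisecond count ('!Q').
    return (struct.pack('!B', MessageType.LATENCY_MEASUREMENT.value) +
            struct.pack('!Q', latency_ms))


def unpack(message: bytes):
    msg_type = MessageType(struct.unpack('!B', message[:1])[0])
    if msg_type is MessageType.PINGPONG:
        return msg_type, struct.unpack('!I', message[1:5])[0]
    if msg_type is MessageType.LATENCY_MEASUREMENT:
        return msg_type, struct.unpack('!Q', message[1:9])[0]
    return msg_type, message[1:]  # regular data: strip the type byte


assert unpack(pack_ping(7)) == (MessageType.PINGPONG, 7)
assert unpack(pack_latency(42)) == (MessageType.LATENCY_MEASUREMENT, 42)
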
sky/users/permission.py CHANGED

@@ -14,6 +14,7 @@ from sky import models
 from sky import sky_logging
 from sky.skylet import constants
 from sky.users import rbac
+from sky.utils import annotations
 from sky.utils import common_utils
 from sky.utils.db import db_utils
 
@@ -42,7 +43,6 @@ class PermissionService:
         with _policy_lock():
             global _enforcer_instance
             if _enforcer_instance is None:
-                _enforcer_instance = self
                 engine = global_user_state.initialize_and_get_db()
                 db_utils.add_all_tables_to_db_sqlalchemy(
                     sqlalchemy_adapter.Base.metadata, engine)
@@ -52,6 +52,10 @@ class PermissionService:
                                           'model.conf')
                 enforcer = casbin.Enforcer(model_path, adapter)
                 self.enforcer = enforcer
+                # Only set the enforcer instance once the enforcer
+                # is successfully initialized, if we change it and then fail
+                # we will set it to None and all subsequent calls will fail.
+                _enforcer_instance = self
                 self._maybe_initialize_policies()
                 self._maybe_initialize_basic_auth_user()
             else:
@@ -254,6 +258,9 @@ class PermissionService:
         with _policy_lock():
             self._load_policy_no_lock()
 
+    # Right now, not a lot of users are using multiple workspaces,
+    # so 5 should be more than enough.
+    @annotations.lru_cache(scope='request', maxsize=5)
     def check_workspace_permission(self, user_id: str,
                                    workspace_name: str) -> bool:
         """Check workspace permission.
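
The reordering matters because `_enforcer_instance` is a module-level singleton: publishing it before the `casbin.Enforcer` construction succeeds would leave a half-initialized instance that every later call happily reuses. A generic sketch of the corrected pattern (names are illustrative, not SkyPilot's):

# Generic sketch of the lazy-singleton pitfall this diff fixes.
import threading

_instance = None
_lock = threading.Lock()


class Service:

    def __init__(self) -> None:
        global _instance
        with _lock:
            if _instance is None:
                # Do all fallible setup first...
                self.backend = self._connect()
                # ...and publish the singleton only after setup succeeded,
                # so a failed first attempt does not poison later calls.
                _instance = self
            else:
                self.backend = _instance.backend

    def _connect(self):
        return object()  # stand-in for enforcer/database initialization
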
sky/utils/admin_policy_utils.py CHANGED

@@ -2,6 +2,7 @@
 import contextlib
 import copy
 import importlib
+import typing
 from typing import Iterator, Optional, Tuple, Union
 import urllib.parse
 
@@ -13,12 +14,16 @@ from sky import exceptions
 from sky import sky_logging
 from sky import skypilot_config
 from sky import task as task_lib
+from sky.server.requests import request_names
 from sky.utils import common_utils
 from sky.utils import config_utils
 from sky.utils import ux_utils
 
 logger = sky_logging.init_logger(__name__)
 
+if typing.TYPE_CHECKING:
+    from sky import models
+
 
 def _is_url(policy_string: str) -> bool:
     """Check if the policy string is a URL."""
@@ -73,6 +78,7 @@ def _get_policy_impl(
 @contextlib.contextmanager
 def apply_and_use_config_in_current_request(
     entrypoint: Union['dag_lib.Dag', 'task_lib.Task'],
+    request_name: request_names.AdminPolicyRequestName,
    request_options: Optional[admin_policy.RequestOptions] = None,
    at_client_side: bool = False,
 ) -> Iterator['dag_lib.Dag']:
@@ -86,7 +92,8 @@ def apply_and_use_config_in_current_request(
     Refer to `apply()` for more details.
     """
     original_config = skypilot_config.to_dict()
-    dag, mutated_config = apply(entrypoint, request_options,
+    dag, mutated_config = apply(entrypoint, request_name, request_options,
+                                at_client_side)
     if mutated_config != original_config:
         with skypilot_config.replace_skypilot_config(mutated_config):
             yield dag
@@ -96,6 +103,7 @@ def apply_and_use_config_in_current_request(
 
 def apply(
     entrypoint: Union['dag_lib.Dag', 'task_lib.Task'],
+    request_name: request_names.AdminPolicyRequestName,
     request_options: Optional[admin_policy.RequestOptions] = None,
     at_client_side: bool = False,
 ) -> Tuple['dag_lib.Dag', config_utils.Config]:
@@ -126,9 +134,13 @@ def apply(
     if policy is None:
         return dag, skypilot_config.to_dict()
 
+    user = None
     if at_client_side:
         logger.info(f'Applying client admin policy: {policy}')
     else:
+        # When being called by the server, the middleware has set the
+        # current user and this information is available at this point.
+        user = common_utils.get_current_user()
         logger.info(f'Applying server admin policy: {policy}')
     config = copy.deepcopy(skypilot_config.to_dict())
     mutated_dag = dag_lib.Dag()
@@ -136,8 +148,9 @@ def apply(
 
     mutated_config = None
     for task in dag.tasks:
-        user_request = admin_policy.UserRequest(task, config,
-                                                at_client_side
+        user_request = admin_policy.UserRequest(task, config, request_name,
+                                                request_options, at_client_side,
+                                                user)
         try:
             mutated_user_request = policy.apply(user_request)
             # Avoid duplicate exception wrapping.
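
With `request_name` and the current `user` now carried on `UserRequest`, a policy can branch on which API call is being made and by whom. A hypothetical policy sketch against SkyPilot's documented admin-policy interface; the `request_name`/`user` attribute names are inferred from the constructor call in this diff, and the filtering rule itself is invented:

# Hypothetical admin policy using the new request context.
import sky


class BlockJobsForGuests(sky.AdminPolicy):
    """Rejects managed-job requests from a placeholder 'guest' user."""

    @classmethod
    def validate_and_mutate(
            cls, user_request: sky.UserRequest) -> sky.MutatedUserRequest:
        # Attribute names below are assumptions based on this diff.
        name = str(getattr(user_request, 'request_name', ''))
        user = getattr(user_request, 'user', None)
        if 'jobs' in name and user is not None and user.name == 'guest':
            raise RuntimeError('guests may not launch managed jobs')
        # Pass the request through unchanged.
        return sky.MutatedUserRequest(
            task=user_request.task,
            skypilot_config=user_request.skypilot_config)
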