skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +25 -7
- sky/adaptors/common.py +24 -1
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +170 -17
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +167 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1299 -380
- sky/backends/cloud_vm_ray_backend.py +1715 -518
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/wheel_utils.py +37 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +89 -48
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +335 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +491 -203
- sky/cli.py +5 -6005
- sky/client/{cli.py → cli/command.py} +2477 -1885
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +320 -0
- sky/client/common.py +70 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1203 -297
- sky/client/sdk_async.py +833 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +73 -0
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +358 -93
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +127 -36
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +563 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +206 -80
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -83
- sky/clouds/seeweb.py +466 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +177 -124
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +349 -139
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
- sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1451 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +132 -2
- sky/execution.py +206 -63
- sky/global_user_state.py +2374 -586
- sky/jobs/__init__.py +5 -0
- sky/jobs/client/sdk.py +242 -65
- sky/jobs/client/sdk_async.py +143 -0
- sky/jobs/constants.py +9 -8
- sky/jobs/controller.py +839 -277
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/recovery_strategy.py +398 -152
- sky/jobs/scheduler.py +315 -189
- sky/jobs/server/core.py +829 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2092 -701
- sky/jobs/utils.py +1242 -160
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +443 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +135 -50
- sky/provision/azure/instance.py +10 -5
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +114 -23
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +93 -14
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +789 -247
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +40 -43
- sky/provision/kubernetes/utils.py +1192 -531
- sky/provision/kubernetes/volume.py +282 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +196 -91
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +110 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +180 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +531 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +807 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/vsphere/common/vim_utils.py +1 -2
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +9 -19
- sky/py.typed +0 -0
- sky/resources.py +844 -118
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +225 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +10 -8
- sky/serve/controller.py +64 -19
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +115 -1
- sky/serve/replica_managers.py +273 -162
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +554 -251
- sky/serve/serve_utils.py +733 -220
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +133 -48
- sky/serve/service_spec.py +135 -16
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +200 -0
- sky/server/common.py +475 -181
- sky/server/config.py +81 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +229 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/requests/executor.py +528 -138
- sky/server/requests/payloads.py +351 -17
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +120 -0
- sky/server/requests/requests.py +817 -224
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +106 -0
- sky/server/rest.py +417 -0
- sky/server/server.py +1290 -284
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +345 -57
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +5 -0
- sky/setup_files/alembic.ini +156 -0
- sky/setup_files/dependencies.py +136 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +102 -5
- sky/skylet/attempt_skylet.py +1 -0
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +27 -20
- sky/skylet/constants.py +171 -19
- sky/skylet/events.py +105 -21
- sky/skylet/job_lib.py +335 -104
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/services.py +564 -0
- sky/skylet/skylet.py +63 -4
- sky/skylet/subprocess_daemon.py +103 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +621 -137
- sky/templates/aws-ray.yml.j2 +10 -3
- sky/templates/azure-ray.yml.j2 +1 -1
- sky/templates/do-ray.yml.j2 +1 -1
- sky/templates/gcp-ray.yml.j2 +57 -0
- sky/templates/hyperbolic-ray.yml.j2 +67 -0
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +607 -51
- sky/templates/lambda-ray.yml.j2 +1 -1
- sky/templates/nebius-ray.yml.j2 +33 -12
- sky/templates/paperspace-ray.yml.j2 +1 -1
- sky/templates/primeintellect-ray.yml.j2 +71 -0
- sky/templates/runpod-ray.yml.j2 +9 -1
- sky/templates/scp-ray.yml.j2 +3 -50
- sky/templates/seeweb-ray.yml.j2 +108 -0
- sky/templates/shadeform-ray.yml.j2 +72 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/websocket_proxy.py +178 -18
- sky/usage/usage_lib.py +18 -11
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +387 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +34 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +16 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +310 -87
- sky/utils/config_utils.py +87 -5
- sky/utils/context.py +402 -0
- sky/utils/context_utils.py +222 -0
- sky/utils/controller_utils.py +264 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +470 -0
- sky/utils/db/migration_utils.py +133 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +13 -27
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +5 -5
- sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +368 -0
- sky/utils/log_utils.py +300 -6
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +213 -37
- sky/utils/schemas.py +905 -147
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +38 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/timeline.py +24 -52
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +86 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +149 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +258 -0
- sky/volumes/server/server.py +122 -0
- sky/volumes/volume.py +212 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/skylet/log_lib.py
CHANGED
|
@@ -4,14 +4,17 @@ This is a remote utility module that provides logging functionality.
|
|
|
4
4
|
"""
|
|
5
5
|
import collections
|
|
6
6
|
import copy
|
|
7
|
+
import functools
|
|
7
8
|
import io
|
|
8
9
|
import multiprocessing.pool
|
|
9
10
|
import os
|
|
11
|
+
import queue as queue_lib
|
|
10
12
|
import shlex
|
|
11
13
|
import subprocess
|
|
12
14
|
import sys
|
|
13
15
|
import tempfile
|
|
14
16
|
import textwrap
|
|
17
|
+
import threading
|
|
15
18
|
import time
|
|
16
19
|
from typing import (Deque, Dict, Iterable, Iterator, List, Optional, TextIO,
|
|
17
20
|
Tuple, Union)
|
|
@@ -21,6 +24,8 @@ import colorama
|
|
|
21
24
|
from sky import sky_logging
|
|
22
25
|
from sky.skylet import constants
|
|
23
26
|
from sky.skylet import job_lib
|
|
27
|
+
from sky.utils import context
|
|
28
|
+
from sky.utils import context_utils
|
|
24
29
|
from sky.utils import log_utils
|
|
25
30
|
from sky.utils import subprocess_utils
|
|
26
31
|
from sky.utils import ux_utils
|
|
@@ -36,6 +41,11 @@ logger = sky_logging.init_logger(__name__)
|
|
|
36
41
|
|
|
37
42
|
LOG_FILE_START_STREAMING_AT = 'Waiting for task resources on '
|
|
38
43
|
|
|
44
|
+
# 16-64KiB seems to be the sweet spot:
|
|
45
|
+
# https://github.com/grpc/grpc.github.io/issues/371
|
|
46
|
+
# TODO(kevin): Benchmark this ourselves and verify.
|
|
47
|
+
DEFAULT_LOG_CHUNK_SIZE = 16 * 1024 # 16KiB
|
|
48
|
+
|
|
39
49
|
|
|
40
50
|
class _ProcessingArgs:
|
|
41
51
|
"""Arguments for processing logs."""
|
|
@@ -59,6 +69,16 @@ class _ProcessingArgs:
|
|
|
59
69
|
self.streaming_prefix = streaming_prefix
|
|
60
70
|
|
|
61
71
|
|
|
72
|
+
def _get_context():
|
|
73
|
+
# TODO(aylei): remove this after we drop the backward-compatibility for
|
|
74
|
+
# 0.9.x in 0.12.0
|
|
75
|
+
# Keep backward-compatibility for the old version of SkyPilot runtimes.
|
|
76
|
+
if 'context' in globals():
|
|
77
|
+
return context.get()
|
|
78
|
+
else:
|
|
79
|
+
return None
|
|
80
|
+
|
|
81
|
+
|
|
62
82
|
def _handle_io_stream(io_stream, out_stream, args: _ProcessingArgs):
|
|
63
83
|
"""Process the stream of a process."""
|
|
64
84
|
out_io = io.TextIOWrapper(io_stream,
|
|
@@ -77,6 +97,9 @@ def _handle_io_stream(io_stream, out_stream, args: _ProcessingArgs):
|
|
|
77
97
|
with open(args.log_path, 'a', encoding='utf-8') as fout:
|
|
78
98
|
with line_processor:
|
|
79
99
|
while True:
|
|
100
|
+
ctx = _get_context()
|
|
101
|
+
if ctx is not None and ctx.is_canceled():
|
|
102
|
+
return
|
|
80
103
|
line = out_io.readline()
|
|
81
104
|
if not line:
|
|
82
105
|
break
|
|
@@ -111,26 +134,24 @@ def _handle_io_stream(io_stream, out_stream, args: _ProcessingArgs):
|
|
|
111
134
|
return ''.join(out)
|
|
112
135
|
|
|
113
136
|
|
|
114
|
-
def process_subprocess_stream(proc,
|
|
115
|
-
|
|
137
|
+
def process_subprocess_stream(proc, stdout_stream_handler,
|
|
138
|
+
stderr_stream_handler) -> Tuple[str, str]:
|
|
139
|
+
"""Process the stream of a process in threads, blocking."""
|
|
116
140
|
if proc.stderr is not None:
|
|
117
141
|
# Asyncio does not work as the output processing can be executed in a
|
|
118
142
|
# different thread.
|
|
119
143
|
# selectors is possible to handle the multiplexing of stdout/stderr,
|
|
120
144
|
# but it introduces buffering making the output not streaming.
|
|
121
145
|
with multiprocessing.pool.ThreadPool(processes=1) as pool:
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
stderr_fut = pool.apply_async(_handle_io_stream,
|
|
125
|
-
args=(proc.stderr, sys.stderr,
|
|
126
|
-
err_args))
|
|
146
|
+
stderr_fut = pool.apply_async(stderr_stream_handler,
|
|
147
|
+
args=(proc.stderr, sys.stderr))
|
|
127
148
|
# Do not launch a thread for stdout as the rich.status does not
|
|
128
149
|
# work in a thread, which is used in
|
|
129
150
|
# log_utils.RayUpLineProcessor.
|
|
130
|
-
stdout =
|
|
151
|
+
stdout = stdout_stream_handler(proc.stdout, sys.stdout)
|
|
131
152
|
stderr = stderr_fut.get()
|
|
132
153
|
else:
|
|
133
|
-
stdout =
|
|
154
|
+
stdout = stdout_stream_handler(proc.stdout, sys.stdout)
|
|
134
155
|
stderr = ''
|
|
135
156
|
return stdout, stderr
|
|
136
157
|
|
|
@@ -176,7 +197,12 @@ def run_with_log(
|
|
|
176
197
|
# Redirect stderr to stdout when using ray, to preserve the order of
|
|
177
198
|
# stdout and stderr.
|
|
178
199
|
stdout_arg = stderr_arg = None
|
|
179
|
-
|
|
200
|
+
ctx = _get_context()
|
|
201
|
+
if process_stream or ctx is not None:
|
|
202
|
+
# Capture stdout/stderr of the subprocess if:
|
|
203
|
+
# 1. Post-processing is needed (process_stream=True)
|
|
204
|
+
# 2. Potential contextual handling is needed (ctx is not None)
|
|
205
|
+
# TODO(aylei): can we always capture the stdout/stderr?
|
|
180
206
|
stdout_arg = subprocess.PIPE
|
|
181
207
|
stderr_arg = subprocess.PIPE if not with_ray else subprocess.STDOUT
|
|
182
208
|
# Use stdin=subprocess.DEVNULL by default, as allowing inputs will mess up
|
|
@@ -194,9 +220,18 @@ def run_with_log(
|
|
|
194
220
|
stdin=stdin,
|
|
195
221
|
**kwargs) as proc:
|
|
196
222
|
try:
|
|
197
|
-
|
|
223
|
+
if ctx is not None:
|
|
224
|
+
# When runs in coroutine, use kill_pg if available to avoid
|
|
225
|
+
# the overhead of refreshing the process tree in the daemon.
|
|
226
|
+
subprocess_utils.kill_process_daemon(proc.pid, use_kill_pg=True)
|
|
227
|
+
else:
|
|
228
|
+
# For backward compatibility, do not specify use_kill_pg by
|
|
229
|
+
# default.
|
|
230
|
+
subprocess_utils.kill_process_daemon(proc.pid)
|
|
198
231
|
stdout = ''
|
|
199
232
|
stderr = ''
|
|
233
|
+
stdout_stream_handler = None
|
|
234
|
+
stderr_stream_handler = None
|
|
200
235
|
|
|
201
236
|
if process_stream:
|
|
202
237
|
if skip_lines is None:
|
|
@@ -223,7 +258,34 @@ def run_with_log(
|
|
|
223
258
|
replace_crlf=with_ray,
|
|
224
259
|
streaming_prefix=streaming_prefix,
|
|
225
260
|
)
|
|
226
|
-
|
|
261
|
+
stdout_stream_handler = functools.partial(
|
|
262
|
+
_handle_io_stream,
|
|
263
|
+
args=args,
|
|
264
|
+
)
|
|
265
|
+
if proc.stderr is not None:
|
|
266
|
+
err_args = copy.copy(args)
|
|
267
|
+
err_args.line_processor = None
|
|
268
|
+
stderr_stream_handler = functools.partial(
|
|
269
|
+
_handle_io_stream,
|
|
270
|
+
args=err_args,
|
|
271
|
+
)
|
|
272
|
+
if ctx is not None:
|
|
273
|
+
# When runs in a coroutine, always process the subprocess
|
|
274
|
+
# stream to:
|
|
275
|
+
# 1. handle context cancellation
|
|
276
|
+
# 2. redirect subprocess stdout/stderr to the contextual
|
|
277
|
+
# stdout/stderr of current coroutine.
|
|
278
|
+
stdout, stderr = context_utils.pipe_and_wait_process(
|
|
279
|
+
ctx,
|
|
280
|
+
proc,
|
|
281
|
+
stdout_stream_handler=stdout_stream_handler,
|
|
282
|
+
stderr_stream_handler=stderr_stream_handler)
|
|
283
|
+
elif process_stream:
|
|
284
|
+
# When runs in a process, only process subprocess stream if
|
|
285
|
+
# necessary to avoid unnecessary stream handling overhead.
|
|
286
|
+
stdout, stderr = process_subprocess_stream(
|
|
287
|
+
proc, stdout_stream_handler, stderr_stream_handler)
|
|
288
|
+
# Ensure returncode is set.
|
|
227
289
|
proc.wait()
|
|
228
290
|
if require_outputs:
|
|
229
291
|
return proc.returncode, stdout, stderr
|
|
@@ -305,6 +367,17 @@ def run_bash_command_with_log(bash_command: str,
|
|
|
305
367
|
shell=True)
|
|
306
368
|
|
|
307
369
|
|
|
370
|
+
def run_bash_command_with_log_and_return_pid(
|
|
371
|
+
bash_command: str,
|
|
372
|
+
log_path: str,
|
|
373
|
+
env_vars: Optional[Dict[str, str]] = None,
|
|
374
|
+
stream_logs: bool = False,
|
|
375
|
+
with_ray: bool = False):
|
|
376
|
+
return_code = run_bash_command_with_log(bash_command, log_path, env_vars,
|
|
377
|
+
stream_logs, with_ray)
|
|
378
|
+
return {'return_code': return_code, 'pid': os.getpid()}
|
|
379
|
+
|
|
380
|
+
|
|
308
381
|
def _follow_job_logs(file,
|
|
309
382
|
job_id: int,
|
|
310
383
|
start_streaming: bool,
|
|
@@ -346,9 +419,9 @@ def _follow_job_logs(file,
|
|
|
346
419
|
wait_last_logs = False
|
|
347
420
|
continue
|
|
348
421
|
status_str = status.value if status is not None else 'None'
|
|
349
|
-
|
|
350
|
-
f'Job finished (status: {status_str}).')
|
|
351
|
-
|
|
422
|
+
finish = ux_utils.finishing_message(
|
|
423
|
+
f'Job finished (status: {status_str}).')
|
|
424
|
+
yield finish + '\n'
|
|
352
425
|
return
|
|
353
426
|
|
|
354
427
|
time.sleep(SKY_LOG_TAILING_GAP_SECONDS)
|
|
@@ -495,9 +568,215 @@ def tail_logs(job_id: Optional[int],
|
|
|
495
568
|
if start_streaming:
|
|
496
569
|
print(line, end='', flush=True)
|
|
497
570
|
status_str = status.value if status is not None else 'None'
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
571
|
+
# Only show "Job finished" for actually terminal states
|
|
572
|
+
if status is not None and status.is_terminal():
|
|
573
|
+
print(ux_utils.finishing_message(
|
|
574
|
+
f'Job finished (status: {status_str}).'),
|
|
575
|
+
flush=True)
|
|
501
576
|
except FileNotFoundError:
|
|
502
577
|
print(f'{colorama.Fore.RED}ERROR: Logs for job {job_id} (status:'
|
|
503
578
|
f' {status.value}) does not exist.{colorama.Style.RESET_ALL}')
|
|
579
|
+
|
|
580
|
+
|
|
581
|
+
def tail_logs_iter(job_id: Optional[int],
|
|
582
|
+
log_dir: Optional[str],
|
|
583
|
+
managed_job_id: Optional[int] = None,
|
|
584
|
+
follow: bool = True,
|
|
585
|
+
tail: int = 0) -> Iterator[str]:
|
|
586
|
+
"""Tail the logs of a job. This is mostly the same as tail_logs, but
|
|
587
|
+
returns an iterator instead of printing to stdout/stderr."""
|
|
588
|
+
if job_id is None:
|
|
589
|
+
# This only happens when job_lib.get_latest_job_id() returns None,
|
|
590
|
+
# which means no job has been submitted to this cluster. See
|
|
591
|
+
# sky.skylet.job_lib.JobLibCodeGen.tail_logs for more details.
|
|
592
|
+
logger.info('Skip streaming logs as no job has been submitted.')
|
|
593
|
+
return
|
|
594
|
+
job_str = f'job {job_id}'
|
|
595
|
+
if managed_job_id is not None:
|
|
596
|
+
job_str = f'managed job {managed_job_id}'
|
|
597
|
+
if log_dir is None:
|
|
598
|
+
msg = f'{job_str.capitalize()} not found (see `sky queue`).'
|
|
599
|
+
yield msg + '\n'
|
|
600
|
+
return
|
|
601
|
+
logger.debug(f'Tailing logs for job, real job_id {job_id}, managed_job_id '
|
|
602
|
+
f'{managed_job_id}.')
|
|
603
|
+
log_path = os.path.join(log_dir, 'run.log')
|
|
604
|
+
log_path = os.path.expanduser(log_path)
|
|
605
|
+
|
|
606
|
+
status = job_lib.update_job_status([job_id], silent=True)[0]
|
|
607
|
+
|
|
608
|
+
# Wait for the log to be written. This is needed because `ray submit`
|
|
609
|
+
# will take some time to start the job and write the log.
|
|
610
|
+
retry_cnt = 0
|
|
611
|
+
while status is not None and not status.is_terminal():
|
|
612
|
+
retry_cnt += 1
|
|
613
|
+
if os.path.exists(log_path) and status != job_lib.JobStatus.INIT:
|
|
614
|
+
break
|
|
615
|
+
if retry_cnt >= SKY_LOG_WAITING_MAX_RETRY:
|
|
616
|
+
err = (f'{colorama.Fore.RED}ERROR: Logs for '
|
|
617
|
+
f'{job_str} (status: {status.value}) does not exist '
|
|
618
|
+
f'after retrying {retry_cnt} times.'
|
|
619
|
+
f'{colorama.Style.RESET_ALL}')
|
|
620
|
+
yield err + '\n'
|
|
621
|
+
return
|
|
622
|
+
waiting = (f'INFO: Waiting {SKY_LOG_WAITING_GAP_SECONDS}s for the logs '
|
|
623
|
+
'to be written...')
|
|
624
|
+
yield waiting + '\n'
|
|
625
|
+
time.sleep(SKY_LOG_WAITING_GAP_SECONDS)
|
|
626
|
+
status = job_lib.update_job_status([job_id], silent=True)[0]
|
|
627
|
+
|
|
628
|
+
start_stream_at = LOG_FILE_START_STREAMING_AT
|
|
629
|
+
# Explicitly declare the type to avoid mypy warning.
|
|
630
|
+
lines: Iterable[str] = []
|
|
631
|
+
if follow and status in [
|
|
632
|
+
job_lib.JobStatus.SETTING_UP,
|
|
633
|
+
job_lib.JobStatus.PENDING,
|
|
634
|
+
job_lib.JobStatus.RUNNING,
|
|
635
|
+
]:
|
|
636
|
+
# Not using `ray job logs` because it will put progress bar in
|
|
637
|
+
# multiple lines.
|
|
638
|
+
with open(log_path, 'r', newline='', encoding='utf-8') as log_file:
|
|
639
|
+
# Using `_follow` instead of `tail -f` to streaming the whole
|
|
640
|
+
# log and creating a new process for tail.
|
|
641
|
+
start_streaming = False
|
|
642
|
+
if tail > 0:
|
|
643
|
+
head_lines_of_log_file = _peek_head_lines(log_file)
|
|
644
|
+
lines = collections.deque(log_file, maxlen=tail)
|
|
645
|
+
start_streaming = _should_stream_the_whole_tail_lines(
|
|
646
|
+
head_lines_of_log_file, lines, start_stream_at)
|
|
647
|
+
for line in lines:
|
|
648
|
+
if start_stream_at in line:
|
|
649
|
+
start_streaming = True
|
|
650
|
+
if start_streaming:
|
|
651
|
+
yield line
|
|
652
|
+
# Now, the cursor is at the end of the last lines
|
|
653
|
+
# if tail > 0
|
|
654
|
+
for line in _follow_job_logs(log_file,
|
|
655
|
+
job_id=job_id,
|
|
656
|
+
start_streaming=start_streaming,
|
|
657
|
+
start_streaming_at=start_stream_at):
|
|
658
|
+
yield line
|
|
659
|
+
else:
|
|
660
|
+
try:
|
|
661
|
+
start_streaming = False
|
|
662
|
+
with open(log_path, 'r', encoding='utf-8') as log_file:
|
|
663
|
+
if tail > 0:
|
|
664
|
+
# If tail > 0, we need to read the last n lines.
|
|
665
|
+
# We use double ended queue to rotate the last n lines.
|
|
666
|
+
head_lines_of_log_file = _peek_head_lines(log_file)
|
|
667
|
+
lines = collections.deque(log_file, maxlen=tail)
|
|
668
|
+
start_streaming = _should_stream_the_whole_tail_lines(
|
|
669
|
+
head_lines_of_log_file, lines, start_stream_at)
|
|
670
|
+
else:
|
|
671
|
+
lines = log_file
|
|
672
|
+
for line in lines:
|
|
673
|
+
if start_stream_at in line:
|
|
674
|
+
start_streaming = True
|
|
675
|
+
if start_streaming:
|
|
676
|
+
yield line
|
|
677
|
+
status_str = status.value if status is not None else 'None'
|
|
678
|
+
# Only show "Job finished" for actually terminal states
|
|
679
|
+
if status is not None and status.is_terminal():
|
|
680
|
+
finish = ux_utils.finishing_message(
|
|
681
|
+
f'Job finished (status: {status_str}).')
|
|
682
|
+
yield finish + '\n'
|
|
683
|
+
return
|
|
684
|
+
except FileNotFoundError:
|
|
685
|
+
err = (
|
|
686
|
+
f'{colorama.Fore.RED}ERROR: Logs for job {job_id} (status:'
|
|
687
|
+
f' {status.value}) does not exist.{colorama.Style.RESET_ALL}')
|
|
688
|
+
yield err + '\n'
|
|
689
|
+
|
|
690
|
+
|
|
691
|
+
class LogBuffer:
|
|
692
|
+
"""In-memory buffer for chunking log lines for streaming."""
|
|
693
|
+
|
|
694
|
+
def __init__(self, max_chars: int = DEFAULT_LOG_CHUNK_SIZE):
|
|
695
|
+
"""Initialize the log buffer.
|
|
696
|
+
|
|
697
|
+
Args:
|
|
698
|
+
max_chars: Maximum buffer size (in characters, not bytes) before
|
|
699
|
+
flushing. The actual amount of bytes (UTF-8 encoding)
|
|
700
|
+
could be more than this, depending on the characters,
|
|
701
|
+
i.e. ASCII characters take 1 byte, while others
|
|
702
|
+
may take 2-4 bytes. But this is fine as our default
|
|
703
|
+
chunk size is well below the default value of
|
|
704
|
+
grpc.max_receive_message_length which is 4MB.
|
|
705
|
+
"""
|
|
706
|
+
self.max_chars = max_chars
|
|
707
|
+
self._buffer = io.StringIO()
|
|
708
|
+
|
|
709
|
+
def _should_flush(self) -> bool:
|
|
710
|
+
return self._buffer.tell() >= self.max_chars
|
|
711
|
+
|
|
712
|
+
def flush(self) -> str:
|
|
713
|
+
"""Get the current buffered content and clear the buffer.
|
|
714
|
+
|
|
715
|
+
Returns:
|
|
716
|
+
The buffered log lines as a single string
|
|
717
|
+
"""
|
|
718
|
+
if not self._buffer.tell():
|
|
719
|
+
return ''
|
|
720
|
+
chunk = self._buffer.getvalue()
|
|
721
|
+
self._buffer.truncate(0)
|
|
722
|
+
self._buffer.seek(0)
|
|
723
|
+
return chunk
|
|
724
|
+
|
|
725
|
+
def write(self, line: str) -> bool:
|
|
726
|
+
"""Add a line to the buffer.
|
|
727
|
+
|
|
728
|
+
Args:
|
|
729
|
+
line: The log line to add
|
|
730
|
+
|
|
731
|
+
Returns:
|
|
732
|
+
True if buffer should be flushed after adding the line
|
|
733
|
+
"""
|
|
734
|
+
self._buffer.write(line)
|
|
735
|
+
return self._should_flush()
|
|
736
|
+
|
|
737
|
+
def close(self):
|
|
738
|
+
self._buffer.close()
|
|
739
|
+
|
|
740
|
+
|
|
741
|
+
def buffered_iter_with_timeout(buffer: LogBuffer, iterable: Iterable[str],
|
|
742
|
+
timeout: float) -> Iterable[str]:
|
|
743
|
+
"""Iterates over an iterable, writing each item to a buffer,
|
|
744
|
+
and flushing the buffer when it is full or no item is
|
|
745
|
+
yielded within the timeout duration."""
|
|
746
|
+
# TODO(kevin): Simplify this using asyncio.timeout, once we move
|
|
747
|
+
# the skylet event loop and gRPC server to asyncio.
|
|
748
|
+
# https://docs.python.org/3/library/asyncio-task.html#timeouts
|
|
749
|
+
|
|
750
|
+
queue: queue_lib.Queue = queue_lib.Queue()
|
|
751
|
+
sentinel = object()
|
|
752
|
+
|
|
753
|
+
def producer():
|
|
754
|
+
try:
|
|
755
|
+
for item in iterable:
|
|
756
|
+
queue.put(item)
|
|
757
|
+
finally:
|
|
758
|
+
queue.put(sentinel)
|
|
759
|
+
|
|
760
|
+
thread = threading.Thread(target=producer, daemon=True)
|
|
761
|
+
thread.start()
|
|
762
|
+
|
|
763
|
+
while True:
|
|
764
|
+
try:
|
|
765
|
+
item = queue.get(timeout=timeout)
|
|
766
|
+
except queue_lib.Empty:
|
|
767
|
+
out = buffer.flush()
|
|
768
|
+
if out:
|
|
769
|
+
yield out
|
|
770
|
+
continue
|
|
771
|
+
|
|
772
|
+
if item is sentinel:
|
|
773
|
+
thread.join()
|
|
774
|
+
out = buffer.flush()
|
|
775
|
+
if out:
|
|
776
|
+
yield out
|
|
777
|
+
return
|
|
778
|
+
|
|
779
|
+
if buffer.write(item):
|
|
780
|
+
out = buffer.flush()
|
|
781
|
+
if out:
|
|
782
|
+
yield out
|
sky/skylet/log_lib.pyi
CHANGED
|
@@ -4,13 +4,14 @@ overloaded type hints for run_with_log(), as we need to determine
|
|
|
4
4
|
the return type based on the value of require_outputs.
|
|
5
5
|
"""
|
|
6
6
|
import typing
|
|
7
|
-
from typing import Dict, List, Optional, Tuple, Union
|
|
7
|
+
from typing import Dict, Iterable, Iterator, List, Optional, Tuple, Union
|
|
8
8
|
|
|
9
9
|
from typing_extensions import Literal
|
|
10
10
|
|
|
11
11
|
from sky import sky_logging as sky_logging
|
|
12
12
|
from sky.skylet import constants as constants
|
|
13
13
|
from sky.skylet import job_lib as job_lib
|
|
14
|
+
from sky.utils import context
|
|
14
15
|
from sky.utils import log_utils as log_utils
|
|
15
16
|
|
|
16
17
|
SKY_LOG_WAITING_GAP_SECONDS: int = ...
|
|
@@ -41,6 +42,10 @@ class _ProcessingArgs:
|
|
|
41
42
|
...
|
|
42
43
|
|
|
43
44
|
|
|
45
|
+
def _get_context() -> Optional[context.SkyPilotContext]:
|
|
46
|
+
...
|
|
47
|
+
|
|
48
|
+
|
|
44
49
|
def _handle_io_stream(io_stream, out_stream, args: _ProcessingArgs):
|
|
45
50
|
...
|
|
46
51
|
|
|
@@ -124,8 +129,46 @@ def run_bash_command_with_log(bash_command: str,
|
|
|
124
129
|
...
|
|
125
130
|
|
|
126
131
|
|
|
132
|
+
def run_bash_command_with_log_and_return_pid(
|
|
133
|
+
bash_command: str,
|
|
134
|
+
log_path: str,
|
|
135
|
+
env_vars: Optional[Dict[str, str]] = ...,
|
|
136
|
+
stream_logs: bool = ...,
|
|
137
|
+
with_ray: bool = ...):
|
|
138
|
+
...
|
|
139
|
+
|
|
140
|
+
|
|
127
141
|
def tail_logs(job_id: int,
|
|
128
142
|
log_dir: Optional[str],
|
|
129
143
|
managed_job_id: Optional[int] = ...,
|
|
130
144
|
follow: bool = ...) -> None:
|
|
131
145
|
...
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def tail_logs_iter(job_id: Optional[int],
|
|
149
|
+
log_dir: Optional[str],
|
|
150
|
+
managed_job_id: Optional[int] = ...,
|
|
151
|
+
follow: bool = ...,
|
|
152
|
+
tail: int = ...) -> Iterator[str]:
|
|
153
|
+
...
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
class LogBuffer:
|
|
157
|
+
max_chars: int
|
|
158
|
+
|
|
159
|
+
def __init__(self, max_chars: int = ...):
|
|
160
|
+
...
|
|
161
|
+
|
|
162
|
+
def flush(self) -> str:
|
|
163
|
+
...
|
|
164
|
+
|
|
165
|
+
def write(self, line: str) -> bool:
|
|
166
|
+
...
|
|
167
|
+
|
|
168
|
+
def close(self):
|
|
169
|
+
...
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def buffered_iter_with_timeout(buffer: LogBuffer, iterable: Iterable[str],
|
|
173
|
+
timeout: float) -> Iterable[str]:
|
|
174
|
+
...
|
|
@@ -40,15 +40,29 @@ def _run_patch(target_file,
|
|
|
40
40
|
"""Applies a patch if it has not been applied already."""
|
|
41
41
|
# .orig is the original file that is not patched.
|
|
42
42
|
orig_file = os.path.abspath(f'{target_file}-v{version}.orig')
|
|
43
|
+
# Get diff filename by replacing .patch with .diff
|
|
44
|
+
diff_file = patch_file.replace('.patch', '.diff')
|
|
45
|
+
|
|
43
46
|
script = f"""\
|
|
44
47
|
which patch >/dev/null 2>&1 || sudo yum install -y patch || true
|
|
45
|
-
which patch >/dev/null 2>&1 || (echo "`patch` is not found. Failed to setup ray." && exit 1)
|
|
46
48
|
if [ ! -f {orig_file} ]; then
|
|
47
49
|
echo Create backup file {orig_file}
|
|
48
50
|
cp {target_file} {orig_file}
|
|
49
51
|
fi
|
|
50
|
-
|
|
51
|
-
|
|
52
|
+
if which patch >/dev/null 2>&1; then
|
|
53
|
+
# System patch command is available, use it
|
|
54
|
+
# It is ok to patch again from the original file.
|
|
55
|
+
patch {orig_file} -i {patch_file} -o {target_file}
|
|
56
|
+
else
|
|
57
|
+
# System patch command not available, use Python patch library
|
|
58
|
+
echo "System patch command not available, using Python patch library..."
|
|
59
|
+
python -m pip install patch
|
|
60
|
+
# Get target directory
|
|
61
|
+
target_dir="$(dirname {target_file})"
|
|
62
|
+
# Execute python patch command
|
|
63
|
+
echo "Executing python -m patch -d $target_dir {diff_file}"
|
|
64
|
+
python -m patch -d "$target_dir" "{diff_file}"
|
|
65
|
+
fi
|
|
52
66
|
"""
|
|
53
67
|
subprocess.run(script, shell=True, check=True)
|
|
54
68
|
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
--- a/autoscaler.py
|
|
2
|
+
+++ b/autoscaler.py
|
|
3
|
+
@@ -1,3 +1,6 @@
|
|
4
|
+
+# From https://github.com/ray-project/ray/blob/ray-2.9.3/python/ray/autoscaler/_private/autoscaler.py
|
|
5
|
+
+# Sky patch changes:
|
|
6
|
+
+# - enable upscaling_speed to be 0.0
|
|
7
|
+
import copy
|
|
8
|
+
import logging
|
|
9
|
+
import math
|
|
10
|
+
@@ -1071,7 +1074,7 @@
|
|
11
|
+
upscaling_speed = self.config.get("upscaling_speed")
|
|
12
|
+
aggressive = self.config.get("autoscaling_mode") == "aggressive"
|
|
13
|
+
target_utilization_fraction = self.config.get("target_utilization_fraction")
|
|
14
|
+
- if upscaling_speed:
|
|
15
|
+
+ if upscaling_speed is not None: # NOTE(sky): enable 0.0
|
|
16
|
+
upscaling_speed = float(upscaling_speed)
|
|
17
|
+
# TODO(ameer): consider adding (if users ask) an option of
|
|
18
|
+
# initial_upscaling_num_workers.
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
--- a/cli.py
|
|
2
|
+
+++ b/cli.py
|
|
3
|
+
@@ -1,3 +1,7 @@
|
|
4
|
+
+# Adapted from https://github.com/ray-project/ray/blob/ray-2.9.3/dashboard/modules/job/cli.py
|
|
5
|
+
+# Fixed the problem in ray's issue https://github.com/ray-project/ray/issues/26514
|
|
6
|
+
+# Otherwise, the output redirection ">" will not work.
|
|
7
|
+
+
|
|
8
|
+
import json
|
|
9
|
+
import os
|
|
10
|
+
import sys
|
|
11
|
+
@@ -270,7 +274,7 @@
|
|
12
|
+
working_dir=working_dir,
|
|
13
|
+
)
|
|
14
|
+
job_id = client.submit_job(
|
|
15
|
+
- entrypoint=list2cmdline(entrypoint),
|
|
16
|
+
+ entrypoint=" ".join(entrypoint),
|
|
17
|
+
submission_id=submission_id,
|
|
18
|
+
runtime_env=final_runtime_env,
|
|
19
|
+
metadata=metadata_json,
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
--- a/command_runner.py
|
|
2
|
+
+++ b/command_runner.py
|
|
3
|
+
@@ -1,3 +1,5 @@
|
|
4
|
+
+# From https://github.com/ray-project/ray/blob/ray-2.9.3/python/ray/autoscaler/_private/command_runner.py
|
|
5
|
+
+
|
|
6
|
+
import hashlib
|
|
7
|
+
import json
|
|
8
|
+
import logging
|
|
9
|
+
@@ -137,7 +139,7 @@
|
|
10
|
+
{
|
|
11
|
+
"ControlMaster": "auto",
|
|
12
|
+
"ControlPath": "{}/%C".format(control_path),
|
|
13
|
+
- "ControlPersist": "10s",
|
|
14
|
+
+ "ControlPersist": "300s",
|
|
15
|
+
}
|
|
16
|
+
)
|
|
17
|
+
self.arg_dict.update(kwargs)
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
--- a/log_monitor.py
|
|
2
|
+
+++ b/log_monitor.py
|
|
3
|
+
@@ -1,3 +1,7 @@
|
|
4
|
+
+# Original file https://github.com/ray-project/ray/blob/ray-2.9.3/python/ray/_private/log_monitor.py
|
|
5
|
+
+# Fixed the problem for progress bar, as the latest version does not preserve \r for progress bar.
|
|
6
|
+
+# We change the newline handling back to https://github.com/ray-project/ray/blob/ray-1.10.0/python/ray/_private/log_monitor.py#L299-L300
|
|
7
|
+
+
|
|
8
|
+
import argparse
|
|
9
|
+
import errno
|
|
10
|
+
import glob
|
|
11
|
+
@@ -374,7 +378,8 @@
|
|
12
|
+
next_line = next_line.decode("utf-8", "replace")
|
|
13
|
+
if next_line == "":
|
|
14
|
+
break
|
|
15
|
+
- next_line = next_line.rstrip("\r\n")
|
|
16
|
+
+ if next_line.endswith("\n"):
|
|
17
|
+
+ next_line = next_line[:-1]
|
|
18
|
+
|
|
19
|
+
if next_line.startswith(ray_constants.LOG_PREFIX_ACTOR_NAME):
|
|
20
|
+
flush() # Possible change of task/actor name.
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
--- a/resource_demand_scheduler.py
|
|
2
|
+
+++ b/resource_demand_scheduler.py
|
|
3
|
+
@@ -1,3 +1,8 @@
|
|
4
|
+
+# From https://github.com/ray-project/ray/blob/ray-2.9.3/python/ray/autoscaler/_private/resource_demand_scheduler.py
|
|
5
|
+
+# Sky patch changes:
|
|
6
|
+
+# - no new nodes are allowed to be launched when the upscaling_speed is 0
|
|
7
|
+
+# - comment out "assert not unfulfilled": this seems a buggy assert
|
|
8
|
+
+
|
|
9
|
+
"""Implements multi-node-type autoscaling.
|
|
10
|
+
|
|
11
|
+
This file implements an autoscaling algorithm that is aware of multiple node
|
|
12
|
+
@@ -448,7 +453,10 @@
|
|
13
|
+
+ placement_group_nodes.get(node_type, 0),
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
- if upper_bound > 0:
|
|
17
|
+
+ # NOTE(sky): do not autoscale when upscaling speed is 0.
|
|
18
|
+
+ if self.upscaling_speed == 0:
|
|
19
|
+
+ upper_bound = 0
|
|
20
|
+
+ if upper_bound >= 0:
|
|
21
|
+
updated_nodes_to_launch[node_type] = min(
|
|
22
|
+
upper_bound, to_launch[node_type]
|
|
23
|
+
)
|
|
24
|
+
@@ -592,7 +600,7 @@
|
|
25
|
+
unfulfilled, including_reserved = get_bin_pack_residual(
|
|
26
|
+
new_node_resources, unfulfilled, strict_spread=True
|
|
27
|
+
)
|
|
28
|
+
- assert not unfulfilled
|
|
29
|
+
+ # assert not unfulfilled # NOTE(sky): buggy assert.
|
|
30
|
+
node_resources += including_reserved
|
|
31
|
+
return to_add, node_resources, node_type_counts
|
|
32
|
+
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
--- a/updater.py
|
|
2
|
+
+++ b/updater.py
|
|
3
|
+
@@ -1,3 +1,7 @@
|
|
4
|
+
+# From https://github.com/ray-project/ray/blob/releases/2.9.3/python/ray/autoscaler/_private/updater.py
|
|
5
|
+
+# Sky patch changes:
|
|
6
|
+
+# - Ensure the node state is refreshed before checking the node is terminated.
|
|
7
|
+
+
|
|
8
|
+
import logging
|
|
9
|
+
import os
|
|
10
|
+
import subprocess
|
|
11
|
+
@@ -325,6 +329,7 @@
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
time.sleep(READY_CHECK_INTERVAL)
|
|
15
|
+
+ self.provider.non_terminated_nodes({})
|
|
16
|
+
|
|
17
|
+
def do_update(self):
|
|
18
|
+
self.provider.set_node_tags(
|