skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +173 -30
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1323 -397
- sky/backends/cloud_vm_ray_backend.py +1749 -1029
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +55 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +90 -49
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +338 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +533 -185
- sky/cli.py +5 -5975
- sky/client/{cli.py → cli/command.py} +2591 -1956
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +322 -0
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +78 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1219 -319
- sky/client/sdk_async.py +827 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +564 -164
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +140 -40
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +570 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +210 -81
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -86
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +231 -167
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +375 -173
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1460 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +126 -2
- sky/execution.py +216 -63
- sky/global_user_state.py +2390 -586
- sky/jobs/__init__.py +7 -0
- sky/jobs/client/sdk.py +300 -58
- sky/jobs/client/sdk_async.py +161 -0
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +848 -275
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +402 -152
- sky/jobs/scheduler.py +314 -189
- sky/jobs/server/core.py +836 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2109 -706
- sky/jobs/utils.py +1306 -215
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +453 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +136 -50
- sky/provision/azure/instance.py +11 -6
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +140 -33
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +101 -20
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +919 -280
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +44 -43
- sky/provision/kubernetes/utils.py +1221 -534
- sky/provision/kubernetes/volume.py +343 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +237 -137
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +117 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +214 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +707 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +4 -4
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +17 -20
- sky/py.typed +0 -0
- sky/resources.py +845 -119
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +227 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +12 -9
- sky/serve/controller.py +68 -17
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +116 -2
- sky/serve/replica_managers.py +434 -249
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +569 -257
- sky/serve/serve_utils.py +775 -265
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +192 -89
- sky/serve/service_spec.py +144 -20
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +202 -0
- sky/server/common.py +478 -182
- sky/server/config.py +85 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +295 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +558 -138
- sky/server/requests/payloads.py +364 -24
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +822 -226
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +455 -0
- sky/server/server.py +1309 -285
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +327 -61
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +11 -1
- sky/setup_files/alembic.ini +160 -0
- sky/setup_files/dependencies.py +139 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +114 -7
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +29 -20
- sky/skylet/constants.py +216 -25
- sky/skylet/events.py +101 -21
- sky/skylet/job_lib.py +345 -164
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +568 -0
- sky/skylet/skylet.py +72 -4
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +685 -163
- sky/templates/aws-ray.yml.j2 +11 -3
- sky/templates/azure-ray.yml.j2 +2 -1
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +2 -1
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +62 -1
- sky/templates/hyperbolic-ray.yml.j2 +68 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +611 -50
- sky/templates/lambda-ray.yml.j2 +2 -1
- sky/templates/nebius-ray.yml.j2 +34 -12
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +2 -1
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +10 -1
- sky/templates/scp-ray.yml.j2 +4 -50
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +212 -37
- sky/usage/usage_lib.py +31 -15
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +397 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +35 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +38 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +314 -91
- sky/utils/config_utils.py +74 -5
- sky/utils/context.py +403 -0
- sky/utils/context_utils.py +242 -0
- sky/utils/controller_utils.py +383 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +485 -0
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +137 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +15 -29
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +18 -8
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +416 -0
- sky/utils/log_utils.py +82 -107
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +217 -39
- sky/utils/schemas.py +955 -160
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +29 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +26 -53
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +165 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +150 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +270 -0
- sky/volumes/server/server.py +124 -0
- sky/volumes/volume.py +215 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
- skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/logs/__init__.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""Sky logging agents."""
|
|
2
|
+
from typing import Optional
|
|
3
|
+
|
|
4
|
+
from sky import exceptions
|
|
5
|
+
from sky import skypilot_config
|
|
6
|
+
from sky.logs.agent import LoggingAgent
|
|
7
|
+
from sky.logs.aws import CloudwatchLoggingAgent
|
|
8
|
+
from sky.logs.gcp import GCPLoggingAgent
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def get_logging_agent() -> Optional[LoggingAgent]:
    """Build the logging agent selected by the `logs.store` config entry.

    Returns:
        None when `logs.store` is not set; otherwise an agent constructed
        from the config section named after the store (`logs.gcp` or
        `logs.aws`, defaulting to an empty dict).

    Raises:
        exceptions.InvalidSkyPilotConfigError: if `logs.store` is set to a
            value other than 'gcp' or 'aws'.
    """
    store = skypilot_config.get_nested(('logs', 'store'), None)
    if store is None:
        return None

    # Checked in order with `==` (not a dict lookup) so that any configured
    # value, hashable or not, falls through to the error below.
    supported_agents = (
        ('gcp', GCPLoggingAgent),
        ('aws', CloudwatchLoggingAgent),
    )
    for store_name, agent_cls in supported_agents:
        if store == store_name:
            return agent_cls(
                skypilot_config.get_nested(('logs', store_name), {}))

    raise exceptions.InvalidSkyPilotConfigError(
        f'Invalid logging store: {store}')
|
sky/logs/agent.py
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
"""Base class for all logging agents."""
|
|
2
|
+
import abc
|
|
3
|
+
import os
|
|
4
|
+
import shlex
|
|
5
|
+
from typing import Any, Dict
|
|
6
|
+
|
|
7
|
+
from sky.skylet import constants
|
|
8
|
+
from sky.utils import resources_utils
|
|
9
|
+
from sky.utils import yaml_utils
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class LoggingAgent(abc.ABC):
    """Abstract interface shared by all logging agents.

    A concrete agent provides `get_setup_command` and
    `get_credential_file_mounts`; the values they return are what the
    provisioner needs to set up the agent on each node.
    """

    @abc.abstractmethod
    def get_setup_command(self,
                          cluster_name: resources_utils.ClusterName) -> str:
        """Return the setup command of the agent for `cluster_name`."""

    @abc.abstractmethod
    def get_credential_file_mounts(self) -> Dict[str, str]:
        """Return the credential file mounts required by the agent."""
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class FluentbitAgent(LoggingAgent):
|
|
32
|
+
"""Base class for logging store that use fluentbit as the agent."""
|
|
33
|
+
|
|
34
|
+
def get_setup_command(self,
|
|
35
|
+
cluster_name: resources_utils.ClusterName) -> str:
|
|
36
|
+
install_cmd = (
|
|
37
|
+
# pylint: disable=line-too-long
|
|
38
|
+
'if ! command -v fluent-bit >/dev/null 2>&1 && [ ! -f /opt/fluent-bit/bin/fluent-bit ]; then '
|
|
39
|
+
'sudo apt-get update; sudo apt-get install -y gnupg; '
|
|
40
|
+
# pylint: disable=line-too-long
|
|
41
|
+
'sudo sh -c \'curl -L https://packages.fluentbit.io/fluentbit.key | gpg --dearmor > /usr/share/keyrings/fluentbit-keyring.gpg\'; '
|
|
42
|
+
# pylint: disable=line-too-long
|
|
43
|
+
'os_id=$(grep -oP \'(?<=^ID=).*\' /etc/os-release 2>/dev/null || lsb_release -is 2>/dev/null | tr \'[:upper:]\' \'[:lower:]\'); '
|
|
44
|
+
# pylint: disable=line-too-long
|
|
45
|
+
'codename=$(grep -oP \'(?<=VERSION_CODENAME=).*\' /etc/os-release 2>/dev/null || lsb_release -cs 2>/dev/null); '
|
|
46
|
+
# pylint: disable=line-too-long
|
|
47
|
+
'echo "deb [signed-by=/usr/share/keyrings/fluentbit-keyring.gpg] https://packages.fluentbit.io/$os_id/$codename $codename main" | sudo tee /etc/apt/sources.list.d/fluent-bit.list; '
|
|
48
|
+
'sudo apt-get update; '
|
|
49
|
+
'sudo apt-get install -y fluent-bit; '
|
|
50
|
+
'fi')
|
|
51
|
+
cfg = self.fluentbit_config(cluster_name)
|
|
52
|
+
cfg_path = os.path.join(constants.LOGGING_CONFIG_DIR, 'fluentbit.yaml')
|
|
53
|
+
config_cmd = (f'mkdir -p {constants.LOGGING_CONFIG_DIR} && '
|
|
54
|
+
f'echo {shlex.quote(cfg)} > {cfg_path}')
|
|
55
|
+
kill_prior_cmd = (
|
|
56
|
+
'if [ -f "/tmp/fluentbit.pid" ]; then '
|
|
57
|
+
# pylint: disable=line-too-long
|
|
58
|
+
'echo "Killing prior fluent-bit process $(cat /tmp/fluentbit.pid)"; '
|
|
59
|
+
'kill "$(cat /tmp/fluentbit.pid)" || true; '
|
|
60
|
+
'fi')
|
|
61
|
+
start_cmd = ('nohup $(command -v fluent-bit || '
|
|
62
|
+
'echo "/opt/fluent-bit/bin/fluent-bit") '
|
|
63
|
+
f'-c {cfg_path} > /tmp/fluentbit.log 2>&1 & '
|
|
64
|
+
'echo $! > /tmp/fluentbit.pid')
|
|
65
|
+
return ('set -e; '
|
|
66
|
+
f'{install_cmd}; '
|
|
67
|
+
f'{config_cmd}; '
|
|
68
|
+
f'{kill_prior_cmd}; '
|
|
69
|
+
f'{start_cmd}')
|
|
70
|
+
|
|
71
|
+
def fluentbit_config(self,
|
|
72
|
+
cluster_name: resources_utils.ClusterName) -> str:
|
|
73
|
+
cfg_dict = {
|
|
74
|
+
'parsers': [{
|
|
75
|
+
'name': 'sky-ray-parser',
|
|
76
|
+
'format': 'regex',
|
|
77
|
+
# pylint: disable=line-too-long
|
|
78
|
+
'regex': r'(?:\x1b\[[\d;]+m)?\((?<worker_name>[^,]+)(?:,\s*rank=(?<rank>\d+))?(?:,\s*pid=(?<pid>\d+))(?:,\s*ip=(?<ip>[\d.]+))?\)(?:\x1b\[[\d;]+m)?\s*(?<log_line>.*)',
|
|
79
|
+
'types': 'rank:integer pid:integer',
|
|
80
|
+
}],
|
|
81
|
+
'pipeline': {
|
|
82
|
+
'inputs': [{
|
|
83
|
+
'name': 'tail',
|
|
84
|
+
'path': f'{constants.SKY_LOGS_DIRECTORY}/*/*.log',
|
|
85
|
+
'path_key': 'log_path',
|
|
86
|
+
# Shorten the refresh interval from 60s to 1s since every
|
|
87
|
+
# job creates a new log file and we must be responsive
|
|
88
|
+
# for this: the VM might be autodown within a minute
|
|
89
|
+
# right after the job completion.
|
|
90
|
+
'refresh_interval': 1,
|
|
91
|
+
}],
|
|
92
|
+
'filters': [{
|
|
93
|
+
'name': 'parser',
|
|
94
|
+
'match': '*',
|
|
95
|
+
'key_name': 'log',
|
|
96
|
+
'parser': 'sky-ray-parser',
|
|
97
|
+
'preserve_key': 'on', # preserve field for backwards compat
|
|
98
|
+
'reserve_data': 'on',
|
|
99
|
+
}],
|
|
100
|
+
'outputs': [self.fluentbit_output_config(cluster_name)],
|
|
101
|
+
}
|
|
102
|
+
}
|
|
103
|
+
return yaml_utils.dump_yaml_str(cfg_dict)
|
|
104
|
+
|
|
105
|
+
@abc.abstractmethod
def fluentbit_output_config(
        self, cluster_name: resources_utils.ClusterName) -> Dict[str, Any]:
    """Build the output entry of the Fluent Bit pipeline.

    ``fluentbit_config`` places the returned mapping as the only element of
    ``pipeline.outputs``. Subclasses must override this method; the base
    implementation has no body and returns ``None``.

    Args:
        cluster_name: The cluster whose logs are being forwarded.

    Returns:
        A mapping describing one Fluent Bit output.
    """
|
sky/logs/aws.py
ADDED
|
@@ -0,0 +1,243 @@
|
|
|
1
|
+
"""AWS CloudWatch logging agent."""
|
|
2
|
+
|
|
3
|
+
from typing import Any, Dict, Optional
|
|
4
|
+
|
|
5
|
+
import pydantic
|
|
6
|
+
|
|
7
|
+
from sky.logs.agent import FluentbitAgent
|
|
8
|
+
from sky.utils import resources_utils
|
|
9
|
+
from sky.utils import yaml_utils
|
|
10
|
+
|
|
11
|
+
# Shell snippet (the double quotes are part of the value) that expands to the
# EC2 instance metadata endpoint: $AWS_EC2_METADATA_SERVICE_ENDPOINT when it is
# set and non-empty, otherwise http://169.254.169.254/. It is spliced verbatim
# into a shell command, so the expansion is done by the shell that runs the
# command, not by Python. The default ends with '/' so a path can be appended
# directly after it.
EC2_MD_URL = '"${AWS_EC2_METADATA_SERVICE_ENDPOINT:-http://169.254.169.254/}"'
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class _CloudwatchLoggingConfig(pydantic.BaseModel):
    """User-facing settings of the AWS CloudWatch logging agent.

    ``CloudwatchLoggingAgent`` builds an instance of this model from the
    configuration mapping it receives.
    """
    # AWS region. When set it is exported as AWS_REGION / AWS_DEFAULT_REGION
    # in the setup command and passed to the Fluent Bit output.
    region: Optional[str] = None
    # Path of an AWS shared credentials file. When set it is exported as
    # AWS_SHARED_CREDENTIALS_FILE and mounted to the same path on the cluster.
    credentials_file: Optional[str] = None
    # CloudWatch log group that receives the logs.
    log_group_name: str = 'skypilot-logs'
    # Leading part of the log stream prefix; the agent appends the cluster's
    # name on cloud and a trailing '-' to it.
    log_stream_prefix: str = 'skypilot-'
    # Forwarded to the Fluent Bit output's ``auto_create_group`` option.
    auto_create_group: bool = True
    # Extra key/value pairs added to every log record, on top of (and able to
    # override) the ``skypilot.cluster_name`` / ``skypilot.cluster_id`` tags.
    additional_tags: Optional[Dict[str, str]] = None
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class _CloudWatchOutputConfig(pydantic.BaseModel):
    """Helper model that renders the ``cloudwatch_logs`` output entry.

    Ref: https://docs.fluentbit.io/manual/pipeline/outputs/cloudwatch
    """
    name: str = 'cloudwatch_logs'
    match: str = '*'
    region: Optional[str] = None
    log_group_name: Optional[str] = None
    log_stream_prefix: Optional[str] = None
    auto_create_group: bool = True
    additional_tags: Optional[Dict[str, str]] = None

    def to_dict(self) -> Dict[str, Any]:
        """Dump the model as a plain dict suitable for the Fluent Bit YAML.

        Fields whose value is ``None`` are omitted, and the boolean
        ``auto_create_group`` is rendered as the string ``'true'`` or
        ``'false'``.

        Returns:
            The output configuration as a dictionary.
        """
        flag_key = 'auto_create_group'
        rendered = self.model_dump(exclude_none=True)
        if flag_key in rendered:
            # Emit a lowercase string literal instead of a Python bool.
            rendered[flag_key] = str(bool(rendered[flag_key])).lower()
        return rendered
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class CloudwatchLoggingAgent(FluentbitAgent):
    """AWS CloudWatch logging agent.

    This agent forwards logs from SkyPilot clusters to AWS CloudWatch using
    Fluent Bit. It supports authentication via IAM roles (preferred), AWS
    credentials file, or environment variables.

    Example configuration:
    ```yaml
    logs:
      store: aws
      aws:
        region: us-west-2
        log_group_name: skypilot-logs
        log_stream_prefix: my-cluster-
        auto_create_group: true
    ```
    """

    def __init__(self, config: Dict[str, Any]):
        """Initialize the CloudWatch logging agent.

        Args:
            config: The configuration for the CloudWatch logging agent.
              See the class docstring for the expected format.
        """
        self.config = _CloudwatchLoggingConfig(**config)
        super().__init__()

    def get_setup_command(self,
                          cluster_name: resources_utils.ClusterName) -> str:
        """Get the command to set up the CloudWatch logging agent.

        The returned shell command runs, in order: a credentials check, the
        region setup (or a warning when no region is known), a best-effort
        CloudWatch access check through the AWS CLI, and finally the setup
        command of the parent class.

        Args:
            cluster_name: The name of the cluster.

        Returns:
            The command to set up the CloudWatch logging agent.
        """
        credential_path = self.config.credentials_file
        region = self.config.region

        # Set AWS credentials and check whether credentials are valid.
        # CloudWatch plugin supports IAM roles, credentials file, and
        # environment variables. If a credentials file is configured, we use
        # it. Otherwise, we check whether the EC2 metadata endpoint answers
        # or credentials are available in the environment.
        if credential_path:
            pre_cmd = (
                f'export AWS_SHARED_CREDENTIALS_FILE={credential_path}; '
                f'if [ ! -f {credential_path} ]; then '
                f'echo "ERROR: AWS credentials file {credential_path} '
                'not found. Please check if the file exists and is '
                'accessible." && exit 1; '
                'fi; '
                # Sanity check of the file layout: at least one [section]
                # header and one aws_access_key_id entry must be present.
                f'if ! grep -q "\\[.*\\]" {credential_path} || '
                f'! grep -q "aws_access_key_id" {credential_path}; then '
                f'echo "ERROR: AWS credentials file {credential_path} is '
                'invalid. It should contain a profile section '
                '[profile_name] and aws_access_key_id." && exit 1; '
                'fi;')
        else:
            # Check if we're running on EC2 with an IAM role or if
            # AWS credentials are available in the environment. The probe
            # only looks at curl's exit status with a 1 second timeout.
            pre_cmd = (
                f'if ! curl -s -m 1 {EC2_MD_URL}'
                'latest/meta-data/iam/security-credentials/ > /dev/null; '
                'then '
                # failed EC2 check, look for env vars
                'if [ -z "$AWS_ACCESS_KEY_ID" ] || '
                '[ -z "$AWS_SECRET_ACCESS_KEY" ]; then '
                'echo "ERROR: AWS CloudWatch logging configuration error. '
                'Not running on EC2 with IAM role and AWS credentials not '
                'found in environment. Please do one of the following: '
                '1. Run on an EC2 instance with an IAM role that has '
                'CloudWatch permissions, 2. Set AWS_ACCESS_KEY_ID and '
                'AWS_SECRET_ACCESS_KEY environment variables, or '
                '3. Provide a credentials file via logs.aws.credentials_file '
                'in SkyPilot config." && exit 1; '
                'fi; '
                'fi;')

        if region:
            # If region is specified, set it in the environment and, when
            # the AWS CLI is installed, in the CLI configuration as well.
            # The probe uses the POSIX redirection form (not the bash-only
            # `&>`) so it matches the other `command -v aws` check below and
            # does not misbehave under a plain POSIX shell.
            pre_cmd += (f' export AWS_REGION={region}'
                        f' AWS_DEFAULT_REGION={region};'
                        ' command -v aws >/dev/null 2>&1 && '
                        f'aws configure set region {region};')
        else:
            # If region is not specified, only warn when it is not available
            # in the environment either; this does not abort the setup.
            pre_cmd += (
                ' if [ -z "$AWS_REGION" ] && '
                '[ -z "$AWS_DEFAULT_REGION" ]; then '
                'echo "WARNING: AWS region not specified in configuration or '
                'environment. CloudWatch logging may fail if the region '
                'cannot be determined. Consider setting logs.aws.region in '
                'SkyPilot config."; '
                'fi; ')

        # Add a test command to verify AWS credentials work with CloudWatch.
        # The check is skipped (with a message) when the AWS CLI is missing.
        pre_cmd += (
            ' echo "Verifying AWS CloudWatch access..."; '
            'if command -v aws > /dev/null; then '
            'aws cloudwatch list-metrics --namespace AWS/Logs --max-items 1 '
            '> /dev/null 2>&1 || '
            '{ echo "ERROR: Failed to access AWS CloudWatch. Please check '
            'your credentials and permissions."; '
            'echo "The IAM role or user must have cloudwatch:ListMetrics '
            'and logs:* permissions."; '
            'exit 1; }; '
            'else echo "AWS CLI not installed, skipping CloudWatch access '
            'verification."; '
            'fi; ')

        return pre_cmd + ' ' + super().get_setup_command(cluster_name)

    def fluentbit_config(self,
                         cluster_name: resources_utils.ClusterName) -> str:
        """Get the Fluent Bit configuration for CloudWatch.

        This overrides the base method to tag every log record with the
        cluster identity. The parent configuration is parsed back from YAML,
        one ``content_modifier`` processor per tag is appended to the first
        pipeline input, and the result is dumped to YAML again.

        The tags are ``skypilot.cluster_name`` and ``skypilot.cluster_id``
        plus the configured ``additional_tags``, which take precedence when
        they use the same keys.

        Args:
            cluster_name: The name of the cluster.

        Returns:
            The Fluent Bit configuration as a YAML string.
        """
        cfg_dict = yaml_utils.read_yaml_str(
            super().fluentbit_config(cluster_name))

        # Build the tags attached to each log record.
        tags = {
            'skypilot.cluster_name': cluster_name.display_name,
            'skypilot.cluster_id': cluster_name.name_on_cloud,
        }
        # Add additional tags if provided
        if self.config.additional_tags:
            tags.update(self.config.additional_tags)

        log_processors = [{
            'name': 'content_modifier',
            'action': 'upsert',
            'key': key,
            'value': value,
        } for key, value in tags.items()]

        # Append the processors to the first input, keeping any processors
        # that the parent configuration may already define there.
        first_input = cfg_dict['pipeline']['inputs'][0]
        processors_config = first_input.setdefault('processors', {})
        processors_config.setdefault('logs', []).extend(log_processors)

        return yaml_utils.dump_yaml_str(cfg_dict)

    def fluentbit_output_config(
            self, cluster_name: resources_utils.ClusterName) -> Dict[str, Any]:
        """Get the Fluent Bit output configuration for CloudWatch.

        Args:
            cluster_name: The name of the cluster.

        Returns:
            The Fluent Bit output configuration for CloudWatch.
        """
        unique_name = cluster_name.name_on_cloud

        # Format the log stream name to include cluster information
        # This helps with identifying logs in CloudWatch
        log_stream_prefix = f'{self.config.log_stream_prefix}{unique_name}-'

        # Only region, log group, stream prefix and auto_create_group are
        # forwarded; additional tags are attached to the records in
        # fluentbit_config() instead of being set on the output.
        return _CloudWatchOutputConfig(
            region=self.config.region,
            log_group_name=self.config.log_group_name,
            log_stream_prefix=log_stream_prefix,
            auto_create_group=self.config.auto_create_group,
        ).to_dict()

    def get_credential_file_mounts(self) -> Dict[str, str]:
        """Get the credential file mounts for the CloudWatch logging agent.

        Returns:
            A dictionary mapping local credential file paths to remote paths.
            The configured credentials file is mapped to the same path; the
            dictionary is empty when no credentials file is configured.
        """
        if self.config.credentials_file:
            return {self.config.credentials_file: self.config.credentials_file}
        return {}
|
sky/logs/gcp.py
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
"""GCP logging agent."""
|
|
2
|
+
|
|
3
|
+
from typing import Any, Dict, Optional
|
|
4
|
+
|
|
5
|
+
import pydantic
|
|
6
|
+
|
|
7
|
+
from sky.clouds import gcp
|
|
8
|
+
from sky.logs.agent import FluentbitAgent
|
|
9
|
+
from sky.utils import resources_utils
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class _GCPLoggingConfig(pydantic.BaseModel):
    """User-facing settings of the GCP logging agent.

    ``GCPLoggingAgent`` builds an instance of this model from the
    configuration mapping it receives.
    """
    # Passed to the stackdriver output as ``export_to_project_id``; the
    # option is omitted from the output when this is None.
    project_id: Optional[str] = None
    # Path of the credentials file exported as GOOGLE_APPLICATION_CREDENTIALS
    # and mounted to the same path on the cluster. When unset, the agent
    # falls back to the default GCP application credential path.
    credentials_file: Optional[str] = None
    # Extra labels merged over the ``skypilot_cluster_name`` /
    # ``skypilot_cluster_id`` labels of the output.
    additional_labels: Optional[Dict[str, str]] = None
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class _StackdriverOutputConfig(pydantic.BaseModel):
    """Helper model that renders the ``stackdriver`` output entry.

    Ref: https://docs.fluentbit.io/manual/1.7/pipeline/outputs/stackdriver
    """
    name: str = 'stackdriver'
    match: str = '*'
    export_to_project_id: Optional[str] = None
    labels: Optional[Dict[str, str]] = None

    def to_dict(self) -> Dict[str, Any]:
        """Dump the model as a plain dict suitable for the Fluent Bit YAML.

        Fields whose value is ``None`` are omitted. A non-empty ``labels``
        mapping is flattened into a single ``k1=v1,k2=v2`` string.

        Returns:
            The output configuration as a dictionary.
        """
        rendered = self.model_dump(exclude_none=True)
        if not self.labels:
            return rendered
        # Replace the label format from `{k: v}` to `k=v`
        rendered['labels'] = ','.join(
            f'{label}={text}' for label, text in self.labels.items())
        return rendered
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class GCPLoggingAgent(FluentbitAgent):
    """GCP logging agent.

    Forwards logs from SkyPilot clusters to GCP through the Fluent Bit
    ``stackdriver`` output. Credentials come either from a service account
    key file (``logs.gcp.credentials_file`` or the default GCP application
    credential path) or from the metadata server.
    """

    def __init__(self, config: Dict[str, Any]):
        """Initialize the GCP logging agent.

        Args:
            config: The configuration for the GCP logging agent; its keys
              are the fields of ``_GCPLoggingConfig``.
        """
        self.config = _GCPLoggingConfig(**config)
        # Run the base-class initializer, as CloudwatchLoggingAgent does.
        super().__init__()

    def get_setup_command(self,
                          cluster_name: resources_utils.ClusterName) -> str:
        """Get the command to set up the GCP logging agent.

        The credentials check is chained with ``&&`` in front of the parent
        setup command, so the parent command only runs when the check passes.

        Args:
            cluster_name: The name of the cluster.

        Returns:
            The command to set up the GCP logging agent.
        """
        credential_path = gcp.DEFAULT_GCP_APPLICATION_CREDENTIAL_PATH
        if self.config.credentials_file:
            credential_path = self.config.credentials_file
        # Set GOOGLE_APPLICATION_CREDENTIALS and check whether credentials
        # is valid.
        # Stackdriver only support service account credentials or credentials
        # from metadata server (only available on GCE or GKE). If the default
        # credentials uploaded by API server is NOT a service account key and
        # there is NO metadata server available, the logging agent will fail to
        # authenticate and we require the user to upload a service account key
        # via logs.gcp.credentials_file in this case.
        # Also note that we use env var instead of YAML config to specify the
        # service account key file path in order to resolve the home directory
        # more reliably.
        # Ref: https://github.com/fluent/fluent-bit/issues/8804
        # TODO(aylei): check whether the credentials config is valid before
        # provision.
        pre_cmd = (f'export GOOGLE_APPLICATION_CREDENTIALS={credential_path}; '
                   f'cat {credential_path} | grep "service_account" || '
                   f'(echo "Credentials file {credential_path} is not a '
                   'service account key, check metadata server" && '
                   'curl -s http://metadata.google.internal >/dev/null || '
                   '(echo "Neither service account key nor metadata server is '
                   'available. Set logs.gcp.credentials_file to a service '
                   'account key in server config and retry." && '
                   'exit 1;))')
        return pre_cmd + ' && ' + super().get_setup_command(cluster_name)

    def fluentbit_output_config(
            self, cluster_name: resources_utils.ClusterName) -> Dict[str, Any]:
        """Get the Fluent Bit output configuration for GCP.

        Args:
            cluster_name: The name of the cluster.

        Returns:
            The ``stackdriver`` output configuration. Its labels carry the
            cluster display name and name on cloud; ``additional_labels``
            from the config are merged last and win on key conflicts.
        """
        display_name = cluster_name.display_name
        unique_name = cluster_name.name_on_cloud

        return _StackdriverOutputConfig(
            export_to_project_id=self.config.project_id,
            labels={
                'skypilot_cluster_name': display_name,
                'skypilot_cluster_id': unique_name,
                **(self.config.additional_labels or {})
            },
        ).to_dict()

    def get_credential_file_mounts(self) -> Dict[str, str]:
        """Get the credential file mounts for the GCP logging agent.

        Returns:
            A dictionary mapping local credential file paths to remote paths.
            The configured credentials file is mapped to the same path; the
            dictionary is empty when no credentials file is configured.
        """
        if self.config.credentials_file:
            return {self.config.credentials_file: self.config.credentials_file}
        return {}
|
sky/metrics/__init__.py
ADDED
|
File without changes
|