skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +12 -2
- sky/adaptors/aws.py +27 -22
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/adaptors/slurm.py +478 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +630 -185
- sky/backends/cloud_vm_ray_backend.py +1111 -928
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +971 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -3
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +36 -32
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/slurm_catalog.py +243 -0
- sky/check.py +87 -46
- sky/client/cli/command.py +1004 -434
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +188 -65
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +8 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +47 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/slurm.py +578 -0
- sky/clouds/ssh.py +6 -3
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +43 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +296 -195
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
- sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +177 -30
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/models.py +2 -0
- sky/optimizer.py +7 -6
- sky/provision/__init__.py +38 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +22 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +112 -28
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +422 -422
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +45 -15
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +572 -0
- sky/provision/slurm/utils.py +583 -0
- sky/provision/vast/instance.py +9 -4
- sky/provision/vast/utils.py +10 -6
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/plugins.py +222 -0
- sky/server/requests/executor.py +384 -145
- sky/server/requests/payloads.py +83 -19
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +123 -0
- sky/server/requests/requests.py +511 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +102 -20
- sky/server/requests/serializers/return_value_serializers.py +60 -0
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +497 -179
- sky/server/server_utils.py +30 -0
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +64 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +116 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +139 -29
- sky/skylet/events.py +74 -14
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +189 -0
- sky/skylet/job_lib.py +143 -105
- sky/skylet/log_lib.py +252 -8
- sky/skylet/log_lib.pyi +47 -7
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +27 -2
- sky/skylet/subprocess_daemon.py +104 -28
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/ssh_node_pools/deploy/deploy.py +952 -0
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +20 -21
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +204 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/slurm-ray.yml.j2 +85 -0
- sky/templates/vast-ray.yml.j2 +2 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/model.conf +1 -1
- sky/users/permission.py +84 -44
- sky/users/rbac.py +31 -3
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +128 -6
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +283 -30
- sky/utils/command_runner.pyi +63 -7
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +55 -7
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +138 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
- skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
- skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
- sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
sky/skylet/log_lib.py
CHANGED

@@ -8,11 +8,13 @@ import functools
 import io
 import multiprocessing.pool
 import os
+import queue as queue_lib
 import shlex
 import subprocess
 import sys
 import tempfile
 import textwrap
+import threading
 import time
 from typing import (Deque, Dict, Iterable, Iterator, List, Optional, TextIO,
                     Tuple, Union)
@@ -39,6 +41,11 @@ logger = sky_logging.init_logger(__name__)

 LOG_FILE_START_STREAMING_AT = 'Waiting for task resources on '

+# 16-64KiB seems to be the sweet spot:
+# https://github.com/grpc/grpc.github.io/issues/371
+# TODO(kevin): Benchmark this ourselves and verify.
+DEFAULT_LOG_CHUNK_SIZE = 16 * 1024  # 16KiB
+

 class _ProcessingArgs:
     """Arguments for processing logs."""
@@ -165,7 +172,7 @@ def run_with_log(
     streaming_prefix: Optional[str] = None,
     log_cmd: bool = False,
     **kwargs,
-) -> Union[int, Tuple[int, str, str]]:
+) -> Union[int, Tuple[int, str, str], Tuple[int, int]]:
     """Runs a command and logs its output to a file.

     Args:
@@ -176,6 +183,8 @@
         process_stream: Whether to post-process the stdout/stderr of the
             command, such as replacing or skipping lines on the fly. If
             enabled, lines are printed only when '\r' or '\n' is found.
+        streaming_prefix: Optional prefix for each log line. Can contain {pid}
+            placeholder which will be replaced with the subprocess PID.

     Returns the returncode or returncode, stdout and stderr of the command.
     Note that the stdout and stderr is already decoded.
@@ -213,7 +222,21 @@
                           stdin=stdin,
                           **kwargs) as proc:
         try:
-
+            if ctx is not None:
+                # When runs in coroutine, use kill_pg if available to avoid
+                # the overhead of refreshing the process tree in the daemon.
+                subprocess_utils.kill_process_daemon(proc.pid,
+                                                     use_kill_pg=True)
+            else:
+                # For backward compatibility, do not specify use_kill_pg by
+                # default.
+                subprocess_utils.kill_process_daemon(proc.pid)
+
+            # Format streaming_prefix with subprocess PID if it contains {pid}
+            formatted_streaming_prefix = streaming_prefix
+            if streaming_prefix and '{pid}' in streaming_prefix:
+                formatted_streaming_prefix = streaming_prefix.format(
+                    pid=proc.pid)
+
             stdout = ''
             stderr = ''
             stdout_stream_handler = None
@@ -242,7 +265,7 @@
                     line_processor=line_processor,
                     # Replace CRLF when the output is logged to driver by ray.
                     replace_crlf=with_ray,
-                    streaming_prefix=
+                    streaming_prefix=formatted_streaming_prefix,
                 )
                 stdout_stream_handler = functools.partial(
                     _handle_io_stream,
@@ -264,7 +287,6 @@
                 stdout, stderr = context_utils.pipe_and_wait_process(
                     ctx,
                     proc,
-                    cancel_callback=subprocess_utils.kill_children_processes,
                     stdout_stream_handler=stdout_stream_handler,
                     stderr_stream_handler=stderr_stream_handler)
             elif process_stream:
@@ -336,7 +358,8 @@ def run_bash_command_with_log(bash_command: str,
                               log_path: str,
                               env_vars: Optional[Dict[str, str]] = None,
                               stream_logs: bool = False,
-                              with_ray: bool = False
+                              with_ray: bool = False,
+                              streaming_prefix: Optional[str] = None):
     with tempfile.NamedTemporaryFile('w', prefix='sky_app_',
                                      delete=False) as fp:
         bash_command = make_task_bash_script(bash_command, env_vars=env_vars)
@@ -351,9 +374,26 @@ def run_bash_command_with_log(bash_command: str,
         log_path,
         stream_logs=stream_logs,
         with_ray=with_ray,
+        streaming_prefix=streaming_prefix,
         shell=True)


+def run_bash_command_with_log_and_return_pid(
+        bash_command: str,
+        log_path: str,
+        env_vars: Optional[Dict[str, str]] = None,
+        stream_logs: bool = False,
+        with_ray: bool = False,
+        streaming_prefix: Optional[str] = None):
+    return_code = run_bash_command_with_log(bash_command,
+                                            log_path,
+                                            env_vars,
+                                            stream_logs,
+                                            with_ray,
+                                            streaming_prefix=streaming_prefix)
+    return {'return_code': return_code, 'pid': os.getpid()}
+
+
 def _follow_job_logs(file,
                      job_id: int,
                      start_streaming: bool,
@@ -395,9 +435,9 @@ def _follow_job_logs(file,
                 wait_last_logs = False
                 continue
             status_str = status.value if status is not None else 'None'
-
-                f'Job finished (status: {status_str}).')
-
+            finish = ux_utils.finishing_message(
+                f'Job finished (status: {status_str}).')
+            yield finish + '\n'
             return

         time.sleep(SKY_LOG_TAILING_GAP_SECONDS)
@@ -552,3 +592,207 @@ def tail_logs(job_id: Optional[int],
     except FileNotFoundError:
         print(f'{colorama.Fore.RED}ERROR: Logs for job {job_id} (status:'
               f' {status.value}) does not exist.{colorama.Style.RESET_ALL}')
+
+
+def tail_logs_iter(job_id: Optional[int],
+                   log_dir: Optional[str],
+                   managed_job_id: Optional[int] = None,
+                   follow: bool = True,
+                   tail: int = 0) -> Iterator[str]:
+    """Tail the logs of a job. This is mostly the same as tail_logs, but
+    returns an iterator instead of printing to stdout/stderr."""
+    if job_id is None:
+        # This only happens when job_lib.get_latest_job_id() returns None,
+        # which means no job has been submitted to this cluster. See
+        # sky.skylet.job_lib.JobLibCodeGen.tail_logs for more details.
+        logger.info('Skip streaming logs as no job has been submitted.')
+        return
+    job_str = f'job {job_id}'
+    if managed_job_id is not None:
+        job_str = f'managed job {managed_job_id}'
+    if log_dir is None:
+        msg = f'{job_str.capitalize()} not found (see `sky queue`).'
+        yield msg + '\n'
+        return
+    logger.debug(f'Tailing logs for job, real job_id {job_id}, managed_job_id '
+                 f'{managed_job_id}.')
+    log_path = os.path.join(log_dir, 'run.log')
+    log_path = os.path.expanduser(log_path)
+
+    status = job_lib.update_job_status([job_id], silent=True)[0]
+
+    # Wait for the log to be written. This is needed due to the `ray submit`
+    # will take some time to start the job and write the log.
+    retry_cnt = 0
+    while status is not None and not status.is_terminal():
+        retry_cnt += 1
+        if os.path.exists(log_path) and status != job_lib.JobStatus.INIT:
+            break
+        if retry_cnt >= SKY_LOG_WAITING_MAX_RETRY:
+            err = (f'{colorama.Fore.RED}ERROR: Logs for '
+                   f'{job_str} (status: {status.value}) does not exist '
+                   f'after retrying {retry_cnt} times.'
+                   f'{colorama.Style.RESET_ALL}')
+            yield err + '\n'
+            return
+        waiting = (f'INFO: Waiting {SKY_LOG_WAITING_GAP_SECONDS}s for the logs '
+                   'to be written...')
+        yield waiting + '\n'
+        time.sleep(SKY_LOG_WAITING_GAP_SECONDS)
+        status = job_lib.update_job_status([job_id], silent=True)[0]
+
+    start_stream_at = LOG_FILE_START_STREAMING_AT
+    # Explicitly declare the type to avoid mypy warning.
+    lines: Iterable[str] = []
+    if follow and status in [
+            job_lib.JobStatus.SETTING_UP,
+            job_lib.JobStatus.PENDING,
+            job_lib.JobStatus.RUNNING,
+    ]:
+        # Not using `ray job logs` because it will put progress bar in
+        # multiple lines.
+        with open(log_path, 'r', newline='', encoding='utf-8') as log_file:
+            # Using `_follow` instead of `tail -f` to streaming the whole
+            # log and creating a new process for tail.
+            start_streaming = False
+            if tail > 0:
+                head_lines_of_log_file = _peek_head_lines(log_file)
+                lines = collections.deque(log_file, maxlen=tail)
+                start_streaming = _should_stream_the_whole_tail_lines(
+                    head_lines_of_log_file, lines, start_stream_at)
+            for line in lines:
+                if start_stream_at in line:
+                    start_streaming = True
+                if start_streaming:
+                    yield line
+            # Now, the cursor is at the end of the last lines
+            # if tail > 0
+            for line in _follow_job_logs(log_file,
+                                         job_id=job_id,
+                                         start_streaming=start_streaming,
+                                         start_streaming_at=start_stream_at):
+                yield line
+    else:
+        try:
+            start_streaming = False
+            with open(log_path, 'r', encoding='utf-8') as log_file:
+                if tail > 0:
+                    # If tail > 0, we need to read the last n lines.
+                    # We use double ended queue to rotate the last n lines.
+                    head_lines_of_log_file = _peek_head_lines(log_file)
+                    lines = collections.deque(log_file, maxlen=tail)
+                    start_streaming = _should_stream_the_whole_tail_lines(
+                        head_lines_of_log_file, lines, start_stream_at)
+                else:
+                    lines = log_file
+                for line in lines:
+                    if start_stream_at in line:
+                        start_streaming = True
+                    if start_streaming:
+                        yield line
+            status_str = status.value if status is not None else 'None'
+            # Only show "Job finished" for actually terminal states
+            if status is not None and status.is_terminal():
+                finish = ux_utils.finishing_message(
+                    f'Job finished (status: {status_str}).')
+                yield finish + '\n'
+            return
+        except FileNotFoundError:
+            err = (
+                f'{colorama.Fore.RED}ERROR: Logs for job {job_id} (status:'
+                f' {status.value}) does not exist.{colorama.Style.RESET_ALL}')
+            yield err + '\n'
+
+
+class LogBuffer:
+    """In-memory buffer for chunking log lines for streaming."""
+
+    def __init__(self, max_chars: int = DEFAULT_LOG_CHUNK_SIZE):
+        """Initialize the log buffer.
+
+        Args:
+            max_chars: Maximum buffer size (in characters, not bytes) before
+                       flushing. The actual amount of bytes (UTF-8 encoding)
+                       could be more than this, depending on the characters,
+                       i.e. ASCII characters take 1 byte, while others
+                       may take 2-4 bytes. But this is fine as our default
+                       chunk size is well below the default value of
+                       grpc.max_receive_message_length which is 4MB.
+        """
+        self.max_chars = max_chars
+        self._buffer = io.StringIO()
+
+    def _should_flush(self) -> bool:
+        return self._buffer.tell() >= self.max_chars
+
+    def flush(self) -> str:
+        """Get the current buffered content and clear the buffer.
+
+        Returns:
+            The buffered log lines as a single string
+        """
+        if not self._buffer.tell():
+            return ''
+        chunk = self._buffer.getvalue()
+        self._buffer.truncate(0)
+        self._buffer.seek(0)
+        return chunk
+
+    def write(self, line: str) -> bool:
+        """Add a line to the buffer.
+
+        Args:
+            line: The log line to add
+
+        Returns:
+            True if buffer should be flushed after adding the line
+        """
+        self._buffer.write(line)
+        return self._should_flush()
+
+    def close(self):
+        self._buffer.close()
+
+
+def buffered_iter_with_timeout(buffer: LogBuffer, iterable: Iterable[str],
+                               timeout: float) -> Iterable[str]:
+    """Iterates over an iterable, writing each item to a buffer,
+    and flushing the buffer when it is full or no item is
+    yielded within the timeout duration."""
+    # TODO(kevin): Simplify this using asyncio.timeout, once we move
+    # the skylet event loop and gRPC server to asyncio.
+    # https://docs.python.org/3/library/asyncio-task.html#timeouts
+
+    queue: queue_lib.Queue = queue_lib.Queue()
+    sentinel = object()
+
+    def producer():
+        try:
+            for item in iterable:
+                queue.put(item)
+        finally:
+            queue.put(sentinel)
+
+    thread = threading.Thread(target=producer, daemon=True)
+    thread.start()
+
+    while True:
+        try:
+            item = queue.get(timeout=timeout)
+        except queue_lib.Empty:
+            out = buffer.flush()
+            if out:
+                yield out
+            continue
+
+        if item is sentinel:
+            thread.join()
+            out = buffer.flush()
+            if out:
+                yield out
+            return
+
+        if buffer.write(item):
+            out = buffer.flush()
+            if out:
+                yield out
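The new chunking helpers above are designed to compose: tail_logs_iter yields individual log lines, LogBuffer accumulates them up to DEFAULT_LOG_CHUNK_SIZE characters, and buffered_iter_with_timeout flushes whenever the buffer fills or the source stalls for `timeout` seconds. A minimal sketch of how a server-side caller might wire them together, assuming the wheel is installed; `stream_job_log_chunks` is a hypothetical wrapper, not part of this diff:

# Illustrative sketch only: shows the intended composition of
# tail_logs_iter + LogBuffer + buffered_iter_with_timeout.
from sky.skylet import log_lib

def stream_job_log_chunks(job_id: int, log_dir: str):
    # Yields one chunk per flush, each aggregating many log lines.
    lines = log_lib.tail_logs_iter(job_id, log_dir, follow=True)
    buffer = log_lib.LogBuffer()  # defaults to DEFAULT_LOG_CHUNK_SIZE (16KiB)
    try:
        # A 1s timeout forces a flush on idle streams so the client
        # still sees partial output promptly.
        for chunk in log_lib.buffered_iter_with_timeout(buffer, lines,
                                                        timeout=1.0):
            yield chunk  # e.g. wrap in one gRPC streaming response message
    finally:
        buffer.close()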
sky/skylet/log_lib.pyi
CHANGED

@@ -4,7 +4,7 @@ overloaded type hints for run_with_log(), as we need to determine
 the return type based on the value of require_outputs.
 """
 import typing
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Dict, Iterable, Iterator, List, Optional, Tuple, Union

 from typing_extensions import Literal

@@ -42,7 +42,7 @@ class _ProcessingArgs:
     ...


-def _get_context() -> Optional[context.
+def _get_context() -> Optional[context.SkyPilotContext]:
     ...


@@ -68,7 +68,7 @@ def run_with_log(cmd: Union[List[str], str],
                  process_stream: bool = ...,
                  line_processor: Optional[log_utils.LineProcessor] = ...,
                  streaming_prefix: Optional[str] = ...,
-
+                 log_cmd: bool = ...,
                  **kwargs) -> int:
     ...

@@ -87,7 +87,7 @@ def run_with_log(cmd: Union[List[str], str],
                  process_stream: bool = ...,
                  line_processor: Optional[log_utils.LineProcessor] = ...,
                  streaming_prefix: Optional[str] = ...,
-
+                 log_cmd: bool = ...,
                  **kwargs) -> Tuple[int, str, str]:
     ...

@@ -106,8 +106,8 @@ def run_with_log(cmd: Union[List[str], str],
                  process_stream: bool = ...,
                  line_processor: Optional[log_utils.LineProcessor] = ...,
                  streaming_prefix: Optional[str] = ...,
-
-                 **kwargs) ->
+                 log_cmd: bool = ...,
+                 **kwargs) -> Tuple[int, int]:
     ...


@@ -125,7 +125,18 @@ def run_bash_command_with_log(bash_command: str,
                               log_path: str,
                               env_vars: Optional[Dict[str, str]] = ...,
                               stream_logs: bool = ...,
-                              with_ray: bool =
+                              with_ray: bool = ...,
+                              streaming_prefix: Optional[str] = ...) -> int:
+    ...
+
+
+def run_bash_command_with_log_and_return_pid(
+        bash_command: str,
+        log_path: str,
+        env_vars: Optional[Dict[str, str]] = ...,
+        stream_logs: bool = ...,
+        with_ray: bool = ...,
+        streaming_prefix: Optional[str] = ...) -> Dict[str, Union[int, str]]:
     ...


@@ -134,3 +145,32 @@ def tail_logs(job_id: int,
              managed_job_id: Optional[int] = ...,
              follow: bool = ...) -> None:
     ...
+
+
+def tail_logs_iter(job_id: Optional[int],
+                   log_dir: Optional[str],
+                   managed_job_id: Optional[int] = ...,
+                   follow: bool = ...,
+                   tail: int = ...) -> Iterator[str]:
+    ...
+
+
+class LogBuffer:
+    max_chars: int
+
+    def __init__(self, max_chars: int = ...):
+        ...
+
+    def flush(self) -> str:
+        ...
+
+    def write(self, line: str) -> bool:
+        ...
+
+    def close(self):
+        ...
+
+
+def buffered_iter_with_timeout(buffer: LogBuffer, iterable: Iterable[str],
+                               timeout: float) -> Iterable[str]:
+    ...
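Per the stubs above and the implementation in log_lib.py, run_bash_command_with_log_and_return_pid returns a dict rather than a bare exit code, and a streaming_prefix containing '{pid}' is formatted with the subprocess PID (note the returned 'pid' is os.getpid() of the calling process, not the subprocess). A hedged usage sketch; the command and log path are illustrative:

from sky.skylet import log_lib

result = log_lib.run_bash_command_with_log_and_return_pid(
    bash_command='echo hello',
    log_path='/tmp/sky_example_run.log',  # illustrative path
    stream_logs=True,
    streaming_prefix='(pid={pid}) ')  # '{pid}' is replaced per subprocess
print(result['return_code'], result['pid'])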
sky/skylet/providers/ibm/node_provider.py
CHANGED

@@ -24,7 +24,7 @@ import socket
 import threading
 import time
 from pathlib import Path
-from pprint import pprint
+from pprint import pformat, pprint
 from typing import Any, Dict, List, Optional
 from uuid import uuid4

@@ -67,13 +67,13 @@ def log_in_out(func):
         logger.debug(
             f"\n\nEnter {name} from {inspect.stack()[0][3]} "
             f"{inspect.stack()[1][3]} {inspect.stack()[2][3]} with args: "
-            f"entered with args:\n{
+            f"entered with args:\n{pformat(args)} and kwargs {pformat(kwargs)}"
         )
         try:
             result = func(*args, **kwargs)
             logger.debug(
                 f"Leave {name} from {inspect.stack()[1][3]} with result "
-                f"Func Result:{
+                f"Func Result:{pformat(result)}\n\n"
             )
         except Exception:
             cli_logger.error(f"Error in {name}")
@@ -445,7 +445,7 @@ class IBMVPCNodeProvider(NodeProvider):
         """returns the worker's node private ip address"""
         node = self._get_cached_node(node_id)

-        # if a bug
+        # if a bug occurred, or node data was fetched before primary_ip
         # was assigned, refetch node data from cloud.
         try:
             primary_ip = node["network_interfaces"][0].get("primary_ip")["address"]
@@ -502,8 +502,12 @@ class IBMVPCNodeProvider(NodeProvider):

         logger.info(f"Creating new VM instance {name}")

-
-
+        if self.vpc_tags is None:
+            raise ValueError("vpc_tags must be initialized before creating instances")
+        vpc_tags = self.vpc_tags  # Help mypy with type narrowing
+
+        security_group_identity_model = {"id": vpc_tags["security_group_id"]}
+        subnet_identity_model = {"id": vpc_tags["subnet_id"]}
         primary_network_interface = {
             "name": "eth0",
             "subnet": subnet_identity_model,
@@ -536,7 +540,7 @@ class IBMVPCNodeProvider(NodeProvider):
         instance_prototype["keys"] = [key_identity_model]
         instance_prototype["profile"] = {"name": profile_name}
         instance_prototype["resource_group"] = {"id": self.resource_group_id}
-        instance_prototype["vpc"] = {"id":
+        instance_prototype["vpc"] = {"id": vpc_tags["vpc_id"]}
         instance_prototype["image"] = {"id": base_config["image_id"]}

         instance_prototype["zone"] = {"name": self.zone}
@@ -584,7 +588,7 @@ class IBMVPCNodeProvider(NodeProvider):
         floating_ip_name = f"{RAY_RECYCLABLE}-{uuid4().hex[:4]}"
         # create a new floating ip
         logger.debug(f"Creating floating IP {floating_ip_name}")
-        floating_ip_prototype = {}
+        floating_ip_prototype: Dict[str, Any] = {}
         floating_ip_prototype["name"] = floating_ip_name
         floating_ip_prototype["zone"] = {"name": self.zone}
         floating_ip_prototype["resource_group"] = {"id": self.resource_group_id}
sky/skylet/providers/ibm/vpc_provider.py
CHANGED

@@ -10,6 +10,7 @@ import textwrap
 import time
 import uuid
 from concurrent.futures import ThreadPoolExecutor
+from typing import Any, Dict

 import requests

@@ -173,7 +174,7 @@ class IBMVPCProvider:
             "a subnet"
         )

-        subnet_prototype = {}
+        subnet_prototype: Dict[str, Any] = {}
         subnet_prototype["zone"] = {"name": zone_name}
         subnet_prototype["ip_version"] = "ipv4"
         subnet_prototype["name"] = subnet_name
@@ -186,7 +187,7 @@ class IBMVPCProvider:

     def create_public_gateway(self, vpc_id, zone_name, subnet_data):

-        gateway_prototype = {}
+        gateway_prototype: Dict[str, Any] = {}
         gateway_prototype["vpc"] = {"id": vpc_id}
         gateway_prototype["zone"] = {"name": zone_name}
         gateway_prototype["name"] = f"{subnet_data['name']}-gw"
@@ -345,7 +346,7 @@ class IBMVPCProvider:
                 return True
             tries -= 1
             time.sleep(sleep_interval)
-        logger.error("Failed to delete instance within the
+        logger.error("Failed to delete instance within the allotted time\n")
         return False

         for subnet_id in self.get_vpc_subnets(vpc_data, region, field="id"):
@@ -522,7 +523,7 @@ class ClusterCleaner:
             if e.code == 404:
                 print(("VPC doesn't exist."))
                 return None
-            else: raise
+            else: raise

         def delete_subnets(vpc_data):
             def _poll_subnet_exists(subnet_id):
@@ -560,12 +561,12 @@ class ClusterCleaner:
                     deleting_resource = False
                 except ibm_cloud_sdk_core.ApiException as e:
                     if e.code == 404:
-                        print("gateway doesn't exist.")
+                        print("gateway doesn't exist.")
                         deleting_resource = False
                     if e.code == 409:
                         print("gateway still in use.")
-                        # will retry until cloud functions timeout.
-                        time.sleep(5)
+                        # will retry until cloud functions timeout.
+                        time.sleep(5)

         def delete_vms(vpc_id):
             def _poll_vpc_contains_vms(vpc_id):
@@ -586,7 +587,7 @@ class ClusterCleaner:
             )

             def _del_instance(vm_data):
-                # first delete ips created by node_provider
+                # first delete ips created by node_provider
                 nic_id = vm_data["network_interfaces"][0]["id"]
                 res = ibm_vpc_client.list_instance_network_interface_floating_ips(
                     vm_data["id"], nic_id
@@ -598,7 +599,7 @@ class ClusterCleaner:
                 ibm_vpc_client.delete_floating_ip(ip["id"])
             print(f"Deleting VM: {vm_data['id']}")
             ibm_vpc_client.delete_instance(id=vm_data["id"])
-
+
             res = ibm_vpc_client.list_instances(vpc_id=vpc_id).get_result()
             num_instances = res["total_count"]

@@ -619,12 +620,12 @@ class ClusterCleaner:
                 deleting_resource = False
             except ibm_cloud_sdk_core.ApiException as e:
                 if e.code == 404:
-                    print("VPC doesn't exist.")
+                    print("VPC doesn't exist.")
                     deleting_resource = False
                 if e.code == 409:
                     print("VPC still in use.")
-                    # will retry until cloud functions timeout.
-                    time.sleep(5)
+                    # will retry until cloud functions timeout.
+                    time.sleep(5)

         def delete_vpc(vpc_id):
             vpc_data = get_vpc_data(vpc_id)
sky/skylet/runtime_utils.py
ADDED

@@ -0,0 +1,21 @@
+"""Runtime utilities for SkyPilot."""
+import os
+
+from sky.skylet import constants
+
+
+def get_runtime_dir_path(path_suffix: str = '') -> str:
+    """Get an expanded path within the SkyPilot runtime directory.
+
+    Args:
+        path_suffix: Path suffix to join with the runtime dir
+            (e.g., '.sky/jobs.db').
+
+    Returns:
+        The full expanded path.
+    """
+    runtime_dir = os.path.expanduser(
+        os.environ.get(constants.SKY_RUNTIME_DIR_ENV_VAR_KEY, '~'))
+    if path_suffix:
+        return os.path.join(runtime_dir, path_suffix)
+    return runtime_dir