skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +10 -2
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +588 -184
- sky/backends/cloud_vm_ray_backend.py +1088 -904
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -0
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +24 -28
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/check.py +73 -43
- sky/client/cli/command.py +675 -412
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +132 -63
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +6 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +40 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +4 -2
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +33 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +174 -165
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +162 -29
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/optimizer.py +1 -1
- sky/provision/__init__.py +37 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +2 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +97 -26
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +382 -418
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +30 -9
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/vast/instance.py +5 -3
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +381 -145
- sky/server/requests/payloads.py +71 -18
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +507 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +85 -20
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +420 -172
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +62 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +111 -26
- sky/skylet/events.py +64 -10
- sky/skylet/job_lib.py +141 -104
- sky/skylet/log_lib.py +233 -5
- sky/skylet/log_lib.pyi +40 -2
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +22 -1
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/server.py +9 -8
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +196 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/permission.py +60 -43
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +22 -0
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +88 -27
- sky/utils/command_runner.pyi +36 -3
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +37 -4
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +107 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/skylet/job_lib.py
CHANGED
@@ -23,20 +23,22 @@ from sky import global_user_state
 from sky import sky_logging
 from sky.adaptors import common as adaptors_common
 from sky.skylet import constants
+from sky.skylet import runtime_utils
 from sky.utils import common_utils
-from sky.utils import log_utils
 from sky.utils import message_utils
 from sky.utils import subprocess_utils
 from sky.utils.db import db_utils
 
 if typing.TYPE_CHECKING:
     import psutil
+
+    from sky.schemas.generated import jobsv1_pb2
 else:
     psutil = adaptors_common.LazyImport('psutil')
+    jobsv1_pb2 = adaptors_common.LazyImport('sky.schemas.generated.jobsv1_pb2')
 
 logger = sky_logging.init_logger(__name__)
 
-_LINUX_NEW_LINE = '\n'
 _JOB_STATUS_LOCK = '~/.sky/locks/.job_{}.lock'
 # JOB_CMD_IDENTIFIER is used for identifying the process retrieved
 # with pid is the same driver process to guard against the case where
@@ -82,13 +84,9 @@ def create_table(cursor, conn):
     # is not critical and is likely to be enabled by other processes.
 
     # Pid column is used for keeping track of the driver process of a job. It
-    # can be in
-    # -1: The job was submitted with SkyPilot older than #4318, where we use
-    # ray job submit to submit the job, i.e. no pid is recorded. This is for
-    # backward compatibility and should be removed after 0.10.0.
+    # can be in two states:
     # 0: The job driver process has never been started. When adding a job with
-    # INIT state, the pid will be set to 0
-    # backward compatibility).
+    # INIT state, the pid will be set to 0.
     # >=0: The job has been started. The pid is the driver process's pid.
     # The driver can be actually running or finished.
     # TODO(SKY-1213): username is actually user hash, should rename.
@@ -144,7 +142,7 @@ def init_db(func):
 
     with _db_init_lock:
         if _DB is None:
-            db_path =
+            db_path = runtime_utils.get_runtime_dir_path('.sky/jobs.db')
             os.makedirs(pathlib.Path(db_path).parents[0], exist_ok=True)
             _DB = db_utils.SQLiteConn(db_path, create_table)
     return func(*args, **kwargs)
@@ -220,6 +218,45 @@ class JobStatus(enum.Enum):
         color = _JOB_STATUS_TO_COLOR[self]
         return f'{color}{self.value}{colorama.Style.RESET_ALL}'
 
+    @classmethod
+    def from_protobuf(
+            cls,
+            protobuf_value: 'jobsv1_pb2.JobStatus') -> Optional['JobStatus']:
+        """Convert protobuf JobStatus enum to Python enum value."""
+        protobuf_to_enum = {
+            jobsv1_pb2.JOB_STATUS_INIT: cls.INIT,
+            jobsv1_pb2.JOB_STATUS_PENDING: cls.PENDING,
+            jobsv1_pb2.JOB_STATUS_SETTING_UP: cls.SETTING_UP,
+            jobsv1_pb2.JOB_STATUS_RUNNING: cls.RUNNING,
+            jobsv1_pb2.JOB_STATUS_FAILED_DRIVER: cls.FAILED_DRIVER,
+            jobsv1_pb2.JOB_STATUS_SUCCEEDED: cls.SUCCEEDED,
+            jobsv1_pb2.JOB_STATUS_FAILED: cls.FAILED,
+            jobsv1_pb2.JOB_STATUS_FAILED_SETUP: cls.FAILED_SETUP,
+            jobsv1_pb2.JOB_STATUS_CANCELLED: cls.CANCELLED,
+            jobsv1_pb2.JOB_STATUS_UNSPECIFIED: None,
+        }
+        if protobuf_value not in protobuf_to_enum:
+            raise ValueError(
+                f'Unknown protobuf JobStatus value: {protobuf_value}')
+        return protobuf_to_enum[protobuf_value]
+
+    def to_protobuf(self) -> 'jobsv1_pb2.JobStatus':
+        """Convert this Python enum value to protobuf enum value."""
+        enum_to_protobuf = {
+            JobStatus.INIT: jobsv1_pb2.JOB_STATUS_INIT,
+            JobStatus.PENDING: jobsv1_pb2.JOB_STATUS_PENDING,
+            JobStatus.SETTING_UP: jobsv1_pb2.JOB_STATUS_SETTING_UP,
+            JobStatus.RUNNING: jobsv1_pb2.JOB_STATUS_RUNNING,
+            JobStatus.FAILED_DRIVER: jobsv1_pb2.JOB_STATUS_FAILED_DRIVER,
+            JobStatus.SUCCEEDED: jobsv1_pb2.JOB_STATUS_SUCCEEDED,
+            JobStatus.FAILED: jobsv1_pb2.JOB_STATUS_FAILED,
+            JobStatus.FAILED_SETUP: jobsv1_pb2.JOB_STATUS_FAILED_SETUP,
+            JobStatus.CANCELLED: jobsv1_pb2.JOB_STATUS_CANCELLED,
+        }
+        if self not in enum_to_protobuf:
+            raise ValueError(f'Unknown JobStatus value: {self}')
+        return enum_to_protobuf[self]
+
 
 # We have two steps for job submissions:
 # 1. Client reserve a job id from the job table by adding a INIT state job.
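
The from_protobuf/to_protobuf converters added above are symmetric over every defined status, with JOB_STATUS_UNSPECIFIED decoding to None instead of raising. A minimal round-trip sketch (assuming the generated jobsv1_pb2 module is importable):

    from sky.schemas.generated import jobsv1_pb2
    from sky.skylet import job_lib

    # Encode a Python-side status into its protobuf value...
    pb = job_lib.JobStatus.RUNNING.to_protobuf()
    assert pb == jobsv1_pb2.JOB_STATUS_RUNNING

    # ...and decode it again on the receiving side.
    assert job_lib.JobStatus.from_protobuf(pb) is job_lib.JobStatus.RUNNING

    # UNSPECIFIED maps to None rather than raising.
    assert job_lib.JobStatus.from_protobuf(
        jobsv1_pb2.JOB_STATUS_UNSPECIFIED) is None
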
@@ -261,11 +298,7 @@ class JobScheduler:
                                 f'WHERE job_id={job_id!r}'))
         _DB.conn.commit()
         pid = subprocess_utils.launch_new_process_tree(run_cmd)
-
-        # This is for the case where the job is submitted with SkyPilot older
-        # than #4318, using ray job submit.
-        if 'job submit' in run_cmd:
-            pid = -1
+
         _DB.cursor.execute((f'UPDATE jobs SET pid={pid} '
                             f'WHERE job_id={job_id!r}'))
         _DB.conn.commit()
@@ -475,6 +508,11 @@ def get_status(job_id: int) -> Optional[JobStatus]:
 
 @init_db
 def get_statuses_payload(job_ids: List[Optional[int]]) -> str:
+    return message_utils.encode_payload(get_statuses(job_ids))
+
+
+@init_db
+def get_statuses(job_ids: List[int]) -> Dict[int, Optional[str]]:
     assert _DB is not None
     # Per-job lock is not required here, since the staled job status will not
     # affect the caller.
@@ -482,10 +520,51 @@ def get_statuses_payload(job_ids: List[Optional[int]]) -> str:
     rows = _DB.cursor.execute(
         f'SELECT job_id, status FROM jobs WHERE job_id IN ({query_str})',
         job_ids)
-    statuses = {job_id: None for job_id in job_ids}
+    statuses: Dict[int, Optional[str]] = {job_id: None for job_id in job_ids}
     for (job_id, status) in rows:
         statuses[job_id] = status
-    return
+    return statuses
+
+
+@init_db
+def get_jobs_info(user_hash: Optional[str] = None,
+                  all_jobs: bool = False) -> List['jobsv1_pb2.JobInfo']:
+    """Get detailed job information.
+
+    Similar to dump_job_queue but returns structured protobuf objects instead
+    of encoded strings.
+
+    Args:
+        user_hash: The user hash to show jobs for. Show all the users if None.
+        all_jobs: Whether to show all jobs, not just the pending/running ones.
+    """
+    assert _DB is not None
+
+    status_list: Optional[List[JobStatus]] = [
+        JobStatus.SETTING_UP, JobStatus.PENDING, JobStatus.RUNNING
+    ]
+    if all_jobs:
+        status_list = None
+
+    jobs = _get_jobs(user_hash, status_list=status_list)
+    jobs_info = []
+    for job in jobs:
+        jobs_info.append(
+            jobsv1_pb2.JobInfo(job_id=job['job_id'],
+                               job_name=job['job_name'],
+                               username=job['username'],
+                               submitted_at=job['submitted_at'],
+                               status=job['status'].to_protobuf(),
+                               run_timestamp=job['run_timestamp'],
+                               start_at=job['start_at'],
+                               end_at=job['end_at'],
+                               resources=job['resources'],
+                               pid=job['pid'],
+                               log_path=os.path.join(
+                                   constants.SKY_LOGS_DIRECTORY,
+                                   job['run_timestamp']),
+                               metadata=json.dumps(job['metadata'])))
+    return jobs_info
 
 
 def load_statuses_payload(
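
The pattern above, in which a legacy *_payload function becomes a thin encoder over a new function returning structured data, recurs throughout this file (get_statuses_payload/get_statuses, get_job_submitted_or_ended_timestamp_payload, cancel_jobs_encoded_results/cancel_jobs, get_log_dir_for_jobs/get_job_log_dirs), and get_jobs_info goes a step further by returning protobuf JobInfo objects directly. A schematic sketch of the pattern, using hypothetical names:

    from typing import Dict, List, Optional

    from sky.utils import message_utils


    def get_things(thing_ids: List[int]) -> Dict[int, Optional[str]]:
        """Structured variant: usable in-process or from a gRPC handler."""
        return {tid: None for tid in thing_ids}  # hypothetical body


    def get_things_payload(thing_ids: List[int]) -> str:
        """Legacy variant: the same data, encoded for remote callers."""
        return message_utils.encode_payload(get_things(thing_ids))
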
@@ -524,16 +603,27 @@ def get_job_submitted_or_ended_timestamp_payload(job_id: int,
     PENDING state.
 
     The normal job duration will use `start_at` instead of `submitted_at` (in
-    `format_job_queue()`), because the job may stay in PENDING if
-    busy.
+    `table_utils.format_job_queue()`), because the job may stay in PENDING if
+    the cluster is busy.
+    """
+    return message_utils.encode_payload(
+        get_job_submitted_or_ended_timestamp(job_id, get_ended_time))
+
+
+@init_db
+def get_job_submitted_or_ended_timestamp(
+        job_id: int, get_ended_time: bool) -> Optional[float]:
+    """Get the job submitted timestamp.
+
+    Returns the raw timestamp or None if job doesn't exist.
     """
     assert _DB is not None
     field = 'end_at' if get_ended_time else 'submitted_at'
     rows = _DB.cursor.execute(f'SELECT {field} FROM jobs WHERE job_id=(?)',
                               (job_id,))
     for (timestamp,) in rows:
-        return
-    return
+        return timestamp
+    return None
 
 
 def get_ray_port():
@@ -542,7 +632,8 @@ def get_ray_port():
     If the port file does not exist, the cluster was launched before #1790,
     return the default port.
     """
-    port_path =
+    port_path = runtime_utils.get_runtime_dir_path(
+        constants.SKY_REMOTE_RAY_PORT_FILE)
     if not os.path.exists(port_path):
         return 6379
     port = json.load(open(port_path, 'r', encoding='utf-8'))['ray_port']
@@ -555,7 +646,8 @@ def get_job_submission_port():
     If the port file does not exist, the cluster was launched before #1790,
     return the default port.
     """
-    port_path =
+    port_path = runtime_utils.get_runtime_dir_path(
+        constants.SKY_REMOTE_RAY_PORT_FILE)
     if not os.path.exists(port_path):
         return 8265
     port = json.load(open(port_path, 'r',
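
Note: the three rewritten call sites above (the jobs.db path in init_db, plus the Ray port files in get_ray_port and get_job_submission_port) now resolve their paths through runtime_utils.get_runtime_dir_path, provided by the new sky/skylet/runtime_utils.py module listed near the top of this diff.
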
@@ -673,7 +765,7 @@ def update_job_status(job_ids: List[int],
     statuses = []
     for job_id in job_ids:
         # Per-job status lock is required because between the job status
-        # query and the job status update, the job status in the
+        # query and the job status update, the job status in the database
         # can be modified by the generated ray program.
         with filelock.FileLock(_get_lock_path(job_id)):
             status = None
@@ -724,12 +816,6 @@ def update_job_status(job_ids: List[int],
                         'the job state is not in terminal states, setting '
                         'it to FAILED_DRIVER')
                     status = JobStatus.FAILED_DRIVER
-                elif job_pid < 0:
-                    # TODO(zhwu): Backward compatibility, remove after 0.10.0.
-                    # We set the job status to PENDING instead of actually
-                    # checking ray job status and let the status in job table
-                    # take effect in the later max.
-                    status = JobStatus.PENDING
 
             pending_job = _get_pending_job(job_id)
             if pending_job is not None:
@@ -842,35 +928,6 @@ def is_cluster_idle() -> bool:
     assert False, 'Should not reach here'
 
 
-def format_job_queue(jobs: List[Dict[str, Any]]):
-    """Format the job queue for display.
-
-    Usage:
-        jobs = get_job_queue()
-        print(format_job_queue(jobs))
-    """
-    job_table = log_utils.create_table([
-        'ID', 'NAME', 'USER', 'SUBMITTED', 'STARTED', 'DURATION', 'RESOURCES',
-        'STATUS', 'LOG', 'GIT COMMIT'
-    ])
-    for job in jobs:
-        job_table.add_row([
-            job['job_id'],
-            job['job_name'],
-            job['username'],
-            log_utils.readable_time_duration(job['submitted_at']),
-            log_utils.readable_time_duration(job['start_at']),
-            log_utils.readable_time_duration(job['start_at'],
-                                             job['end_at'],
-                                             absolute=True),
-            job['resources'],
-            job['status'].colored_str(),
-            job['log_path'],
-            job.get('metadata', {}).get('git_commit', '-'),
-        ])
-    return job_table
-
-
 def dump_job_queue(user_hash: Optional[str], all_jobs: bool) -> str:
     """Get the job queue in encoded json format.
 
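
format_job_queue does not disappear from the package: the docstring updated in an earlier hunk now references table_utils.format_job_queue(), which lines up with the sky/{volumes/utils.py → client/cli/table_utils.py} entry in the file list. Its removal here also pairs with dropping the `from sky.utils import log_utils` import in the first hunk; sky/utils/log_utils.py itself loses 319 lines in this release.
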
@@ -907,27 +964,6 @@ def load_job_queue(payload: str) -> List[Dict[str, Any]]:
     return jobs
 
 
-# TODO(zhwu): Backward compatibility for jobs submitted before #4318, remove
-# after 0.10.0.
-def _create_ray_job_submission_client():
-    """Import the ray job submission client."""
-    try:
-        import ray  # pylint: disable=import-outside-toplevel
-    except ImportError:
-        logger.error('Failed to import ray')
-        raise
-    try:
-        # pylint: disable=import-outside-toplevel
-        from ray import job_submission
-    except ImportError:
-        logger.error(
-            f'Failed to import job_submission with ray=={ray.__version__}')
-        raise
-    port = get_job_submission_port()
-    return job_submission.JobSubmissionClient(
-        address=f'http://127.0.0.1:{port}')
-
-
 def _make_ray_job_id(sky_job_id: int) -> str:
     return f'{sky_job_id}-{getpass.getuser()}'
 
@@ -947,6 +983,13 @@ def cancel_jobs_encoded_results(jobs: Optional[List[int]],
         Encoded job IDs that are actually cancelled. Caller should use
         message_utils.decode_payload() to parse.
     """
+    return message_utils.encode_payload(cancel_jobs(jobs, cancel_all,
+                                                    user_hash))
+
+
+def cancel_jobs(jobs: Optional[List[int]],
+                cancel_all: bool = False,
+                user_hash: Optional[str] = None) -> List[int]:
     job_records = []
     all_status = [JobStatus.PENDING, JobStatus.SETTING_UP, JobStatus.RUNNING]
     if jobs is None and not cancel_all:
@@ -989,18 +1032,6 @@ def cancel_jobs_encoded_results(jobs: Optional[List[int]],
             # We don't have to start a daemon to forcefully kill the process
             # as our job driver process will clean up the underlying
             # child processes.
-            elif job['pid'] < 0:
-                try:
-                    # TODO(zhwu): Backward compatibility, remove after 0.10.0.
-                    # The job was submitted with ray job submit before #4318.
-                    job_client = _create_ray_job_submission_client()
-                    job_client.stop_job(_make_ray_job_id(job['job_id']))
-                except RuntimeError as e:
-                    # If the request to the job server fails, we should not
-                    # set the job to CANCELLED.
-                    if 'does not exist' not in str(e):
-                        logger.warning(str(e))
-                        continue
             # Get the job status again to avoid race condition.
             job_status = get_status_no_lock(job['job_id'])
             if job_status in [
@@ -1010,7 +1041,7 @@ def cancel_jobs_encoded_results(jobs: Optional[List[int]],
                 cancelled_ids.append(job['job_id'])
 
     scheduler.schedule_step()
-    return
+    return cancelled_ids
 
 
 @init_db
@@ -1030,6 +1061,17 @@ def get_run_timestamp(job_id: Optional[int]) -> Optional[str]:
 
 @init_db
 def get_log_dir_for_jobs(job_ids: List[Optional[str]]) -> str:
+    """Returns the relative paths to the log files for jobs with globbing,
+    encoded."""
+    job_to_dir = get_job_log_dirs(job_ids)
+    job_to_dir_str: Dict[str, str] = {}
+    for job_id, log_dir in job_to_dir.items():
+        job_to_dir_str[str(job_id)] = log_dir
+    return message_utils.encode_payload(job_to_dir_str)
+
+
+@init_db
+def get_job_log_dirs(job_ids: List[int]) -> Dict[int, str]:
     """Returns the relative paths to the log files for jobs with globbing."""
     assert _DB is not None
     query_str = ' OR '.join(['job_id GLOB (?)'] * len(job_ids))
@@ -1038,16 +1080,16 @@ def get_log_dir_for_jobs(job_ids: List[Optional[str]]) -> str:
         SELECT * FROM jobs
         WHERE {query_str}""", job_ids)
     rows = _DB.cursor.fetchall()
-    job_to_dir = {}
+    job_to_dir: Dict[int, str] = {}
     for row in rows:
         job_id = row[JobInfoLoc.JOB_ID.value]
         if row[JobInfoLoc.LOG_PATH.value]:
-            job_to_dir[
+            job_to_dir[job_id] = row[JobInfoLoc.LOG_PATH.value]
         else:
             run_timestamp = row[JobInfoLoc.RUN_TIMESTAMP.value]
-            job_to_dir[
-
-    return
+            job_to_dir[job_id] = os.path.join(constants.SKY_LOGS_DIRECTORY,
+                                              run_timestamp)
+    return job_to_dir
 
 
 class JobLibCodeGen:
@@ -1176,15 +1218,10 @@ class JobLibCodeGen:
             f' log_dir = None if run_timestamp is None else os.path.join({constants.SKY_LOGS_DIRECTORY!r}, run_timestamp)'
             ),
             # Add a newline to leave the if indent block above.
-            f'\
-            f'{_LINUX_NEW_LINE}if getattr(constants, "SKYLET_LIB_VERSION", 1) > 1: tail_log_kwargs["tail"] = {tail}',
-            f'{_LINUX_NEW_LINE}log_lib.tail_logs(**tail_log_kwargs)',
+            f'\nlog_lib.tail_logs(job_id=job_id, log_dir=log_dir, managed_job_id={managed_job_id!r}, follow={follow}, tail={tail})',
             # After tailing, check the job status and exit with appropriate code
             'job_status = job_lib.get_status(job_id)',
-
-            # and older did not have JobExitCode, so we use 0 for those versions
-            # TODO: Remove this special handling after 0.10.0.
-            'exit_code = exceptions.JobExitCode.from_job_status(job_status) if getattr(constants, "SKYLET_LIB_VERSION", 1) > 2 else 0',
+            'exit_code = exceptions.JobExitCode.from_job_status(job_status)',
             # Fix for dashboard: When follow=False and job is still running (NOT_FINISHED=101),
             # exit with success (0) since fetching current logs is a successful operation.
             # This prevents shell wrappers from printing "command terminated with exit code 101".
sky/skylet/log_lib.py
CHANGED
@@ -8,11 +8,13 @@ import functools
 import io
 import multiprocessing.pool
 import os
+import queue as queue_lib
 import shlex
 import subprocess
 import sys
 import tempfile
 import textwrap
+import threading
 import time
 from typing import (Deque, Dict, Iterable, Iterator, List, Optional, TextIO,
                     Tuple, Union)
@@ -39,6 +41,11 @@ logger = sky_logging.init_logger(__name__)
 
 LOG_FILE_START_STREAMING_AT = 'Waiting for task resources on '
 
+# 16-64KiB seems to be the sweet spot:
+# https://github.com/grpc/grpc.github.io/issues/371
+# TODO(kevin): Benchmark this ourselves and verify.
+DEFAULT_LOG_CHUNK_SIZE = 16 * 1024  # 16KiB
+
 
 class _ProcessingArgs:
     """Arguments for processing logs."""
@@ -213,7 +220,14 @@ def run_with_log(
                           stdin=stdin,
                           **kwargs) as proc:
         try:
-            subprocess_utils.kill_process_daemon(proc.pid)
+            if ctx is not None:
+                # When runs in coroutine, use kill_pg if available to avoid
+                # the overhead of refreshing the process tree in the daemon.
+                subprocess_utils.kill_process_daemon(proc.pid, use_kill_pg=True)
+            else:
+                # For backward compatibility, do not specify use_kill_pg by
+                # default.
+                subprocess_utils.kill_process_daemon(proc.pid)
             stdout = ''
             stderr = ''
             stdout_stream_handler = None
@@ -264,7 +278,6 @@ def run_with_log(
                 stdout, stderr = context_utils.pipe_and_wait_process(
                     ctx,
                     proc,
-                    cancel_callback=subprocess_utils.kill_children_processes,
                     stdout_stream_handler=stdout_stream_handler,
                     stderr_stream_handler=stderr_stream_handler)
             elif process_stream:
@@ -354,6 +367,17 @@ def run_bash_command_with_log(bash_command: str,
                         shell=True)
 
 
+def run_bash_command_with_log_and_return_pid(
+        bash_command: str,
+        log_path: str,
+        env_vars: Optional[Dict[str, str]] = None,
+        stream_logs: bool = False,
+        with_ray: bool = False):
+    return_code = run_bash_command_with_log(bash_command, log_path, env_vars,
+                                            stream_logs, with_ray)
+    return {'return_code': return_code, 'pid': os.getpid()}
+
+
 def _follow_job_logs(file,
                      job_id: int,
                      start_streaming: bool,
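
run_bash_command_with_log_and_return_pid pairs the exit code with os.getpid() of the process that ran the command, which is only informative when the call is dispatched to a separate worker process. A usage sketch under that assumption (the pool setup is illustrative, not from this diff):

    import multiprocessing

    from sky.skylet import log_lib

    # Run in a child process so that os.getpid() inside the helper reports
    # the worker's PID rather than the caller's.
    with multiprocessing.Pool(1) as pool:
        result = pool.apply(log_lib.run_bash_command_with_log_and_return_pid,
                            ('echo hello', '/tmp/run.log'))
    print(result['return_code'], result['pid'])
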
@@ -395,9 +419,9 @@ def _follow_job_logs(file,
                     wait_last_logs = False
                     continue
                 status_str = status.value if status is not None else 'None'
-
-                    f'Job finished (status: {status_str}).')
-
+                finish = ux_utils.finishing_message(
+                    f'Job finished (status: {status_str}).')
+                yield finish + '\n'
                 return
 
         time.sleep(SKY_LOG_TAILING_GAP_SECONDS)
@@ -552,3 +576,207 @@ def tail_logs(job_id: Optional[int],
     except FileNotFoundError:
         print(f'{colorama.Fore.RED}ERROR: Logs for job {job_id} (status:'
               f' {status.value}) does not exist.{colorama.Style.RESET_ALL}')
+
+
+def tail_logs_iter(job_id: Optional[int],
+                   log_dir: Optional[str],
+                   managed_job_id: Optional[int] = None,
+                   follow: bool = True,
+                   tail: int = 0) -> Iterator[str]:
+    """Tail the logs of a job. This is mostly the same as tail_logs, but
+    returns an iterator instead of printing to stdout/stderr."""
+    if job_id is None:
+        # This only happens when job_lib.get_latest_job_id() returns None,
+        # which means no job has been submitted to this cluster. See
+        # sky.skylet.job_lib.JobLibCodeGen.tail_logs for more details.
+        logger.info('Skip streaming logs as no job has been submitted.')
+        return
+    job_str = f'job {job_id}'
+    if managed_job_id is not None:
+        job_str = f'managed job {managed_job_id}'
+    if log_dir is None:
+        msg = f'{job_str.capitalize()} not found (see `sky queue`).'
+        yield msg + '\n'
+        return
+    logger.debug(f'Tailing logs for job, real job_id {job_id}, managed_job_id '
+                 f'{managed_job_id}.')
+    log_path = os.path.join(log_dir, 'run.log')
+    log_path = os.path.expanduser(log_path)
+
+    status = job_lib.update_job_status([job_id], silent=True)[0]
+
+    # Wait for the log to be written. This is needed due to the `ray submit`
+    # will take some time to start the job and write the log.
+    retry_cnt = 0
+    while status is not None and not status.is_terminal():
+        retry_cnt += 1
+        if os.path.exists(log_path) and status != job_lib.JobStatus.INIT:
+            break
+        if retry_cnt >= SKY_LOG_WAITING_MAX_RETRY:
+            err = (f'{colorama.Fore.RED}ERROR: Logs for '
+                   f'{job_str} (status: {status.value}) does not exist '
+                   f'after retrying {retry_cnt} times.'
+                   f'{colorama.Style.RESET_ALL}')
+            yield err + '\n'
+            return
+        waiting = (f'INFO: Waiting {SKY_LOG_WAITING_GAP_SECONDS}s for the logs '
+                   'to be written...')
+        yield waiting + '\n'
+        time.sleep(SKY_LOG_WAITING_GAP_SECONDS)
+        status = job_lib.update_job_status([job_id], silent=True)[0]
+
+    start_stream_at = LOG_FILE_START_STREAMING_AT
+    # Explicitly declare the type to avoid mypy warning.
+    lines: Iterable[str] = []
+    if follow and status in [
+            job_lib.JobStatus.SETTING_UP,
+            job_lib.JobStatus.PENDING,
+            job_lib.JobStatus.RUNNING,
+    ]:
+        # Not using `ray job logs` because it will put progress bar in
+        # multiple lines.
+        with open(log_path, 'r', newline='', encoding='utf-8') as log_file:
+            # Using `_follow` instead of `tail -f` to streaming the whole
+            # log and creating a new process for tail.
+            start_streaming = False
+            if tail > 0:
+                head_lines_of_log_file = _peek_head_lines(log_file)
+                lines = collections.deque(log_file, maxlen=tail)
+                start_streaming = _should_stream_the_whole_tail_lines(
+                    head_lines_of_log_file, lines, start_stream_at)
+            for line in lines:
+                if start_stream_at in line:
+                    start_streaming = True
+                if start_streaming:
+                    yield line
+            # Now, the cursor is at the end of the last lines
+            # if tail > 0
+            for line in _follow_job_logs(log_file,
+                                         job_id=job_id,
+                                         start_streaming=start_streaming,
+                                         start_streaming_at=start_stream_at):
+                yield line
+    else:
+        try:
+            start_streaming = False
+            with open(log_path, 'r', encoding='utf-8') as log_file:
+                if tail > 0:
+                    # If tail > 0, we need to read the last n lines.
+                    # We use double ended queue to rotate the last n lines.
+                    head_lines_of_log_file = _peek_head_lines(log_file)
+                    lines = collections.deque(log_file, maxlen=tail)
+                    start_streaming = _should_stream_the_whole_tail_lines(
+                        head_lines_of_log_file, lines, start_stream_at)
+                else:
+                    lines = log_file
+                for line in lines:
+                    if start_stream_at in line:
+                        start_streaming = True
+                    if start_streaming:
+                        yield line
+            status_str = status.value if status is not None else 'None'
+            # Only show "Job finished" for actually terminal states
+            if status is not None and status.is_terminal():
+                finish = ux_utils.finishing_message(
+                    f'Job finished (status: {status_str}).')
+                yield finish + '\n'
+            return
+        except FileNotFoundError:
+            err = (
+                f'{colorama.Fore.RED}ERROR: Logs for job {job_id} (status:'
+                f' {status.value}) does not exist.{colorama.Style.RESET_ALL}')
+            yield err + '\n'
+
+
+class LogBuffer:
+    """In-memory buffer for chunking log lines for streaming."""
+
+    def __init__(self, max_chars: int = DEFAULT_LOG_CHUNK_SIZE):
+        """Initialize the log buffer.
+
+        Args:
+            max_chars: Maximum buffer size (in characters, not bytes) before
+                       flushing. The actual amount of bytes (UTF-8 encoding)
+                       could be more than this, depending on the characters,
+                       i.e. ASCII characters take 1 byte, while others
+                       may take 2-4 bytes. But this is fine as our default
+                       chunk size is well below the default value of
+                       grpc.max_receive_message_length which is 4MB.
+        """
+        self.max_chars = max_chars
+        self._buffer = io.StringIO()
+
+    def _should_flush(self) -> bool:
+        return self._buffer.tell() >= self.max_chars
+
+    def flush(self) -> str:
+        """Get the current buffered content and clear the buffer.
+
+        Returns:
+            The buffered log lines as a single string
+        """
+        if not self._buffer.tell():
+            return ''
+        chunk = self._buffer.getvalue()
+        self._buffer.truncate(0)
+        self._buffer.seek(0)
+        return chunk
+
+    def write(self, line: str) -> bool:
+        """Add a line to the buffer.
+
+        Args:
+            line: The log line to add
+
+        Returns:
+            True if buffer should be flushed after adding the line
+        """
+        self._buffer.write(line)
+        return self._should_flush()
+
+    def close(self):
+        self._buffer.close()
+
+
+def buffered_iter_with_timeout(buffer: LogBuffer, iterable: Iterable[str],
+                               timeout: float) -> Iterable[str]:
+    """Iterates over an iterable, writing each item to a buffer,
+    and flushing the buffer when it is full or no item is
+    yielded within the timeout duration."""
+    # TODO(kevin): Simplify this using asyncio.timeout, once we move
+    # the skylet event loop and gRPC server to asyncio.
+    # https://docs.python.org/3/library/asyncio-task.html#timeouts
+
+    queue: queue_lib.Queue = queue_lib.Queue()
+    sentinel = object()
+
+    def producer():
+        try:
+            for item in iterable:
+                queue.put(item)
+        finally:
+            queue.put(sentinel)
+
+    thread = threading.Thread(target=producer, daemon=True)
+    thread.start()
+
+    while True:
+        try:
+            item = queue.get(timeout=timeout)
+        except queue_lib.Empty:
+            out = buffer.flush()
+            if out:
+                yield out
+            continue
+
+        if item is sentinel:
+            thread.join()
+            out = buffer.flush()
+            if out:
+                yield out
+            return
+
+        if buffer.write(item):
+            out = buffer.flush()
+            if out:
+                yield out