dstack 0.0.9__py3-none-any.whl → 0.20.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dstack/_internal/cli/commands/__init__.py +80 -0
- dstack/_internal/cli/commands/apply.py +100 -0
- dstack/_internal/cli/commands/attach.py +161 -0
- dstack/_internal/cli/commands/completion.py +22 -0
- dstack/_internal/cli/commands/delete.py +44 -0
- dstack/_internal/cli/commands/event.py +168 -0
- dstack/_internal/cli/commands/fleet.py +161 -0
- dstack/_internal/cli/commands/gateway.py +159 -0
- dstack/_internal/cli/commands/init.py +64 -0
- dstack/_internal/cli/commands/login.py +352 -0
- dstack/_internal/cli/commands/logs.py +62 -0
- dstack/_internal/cli/commands/metrics.py +153 -0
- dstack/_internal/cli/commands/offer.py +146 -0
- dstack/_internal/cli/commands/project.py +259 -0
- dstack/_internal/cli/commands/ps.py +81 -0
- dstack/_internal/cli/commands/run.py +69 -0
- dstack/_internal/cli/commands/secrets.py +92 -0
- dstack/_internal/cli/commands/server.py +96 -0
- dstack/_internal/cli/commands/stop.py +26 -0
- dstack/_internal/cli/commands/volume.py +117 -0
- dstack/_internal/cli/main.py +101 -0
- dstack/_internal/cli/models/gateways.py +16 -0
- dstack/_internal/cli/models/offers.py +47 -0
- dstack/_internal/cli/models/runs.py +16 -0
- dstack/_internal/cli/services/args.py +31 -0
- dstack/_internal/cli/services/completion.py +91 -0
- dstack/_internal/cli/services/configurators/__init__.py +86 -0
- dstack/_internal/cli/services/configurators/base.py +103 -0
- dstack/_internal/cli/services/configurators/fleet.py +475 -0
- dstack/_internal/cli/services/configurators/gateway.py +231 -0
- dstack/_internal/cli/services/configurators/run.py +882 -0
- dstack/_internal/cli/services/configurators/volume.py +222 -0
- dstack/_internal/cli/services/events.py +68 -0
- dstack/_internal/cli/services/profile.py +182 -0
- dstack/_internal/cli/services/repos.py +71 -0
- dstack/_internal/cli/services/resources.py +54 -0
- dstack/_internal/cli/utils/common.py +159 -0
- dstack/_internal/cli/utils/fleet.py +106 -0
- dstack/_internal/cli/utils/gateway.py +56 -0
- dstack/_internal/cli/utils/gpu.py +178 -0
- dstack/_internal/cli/utils/rich.py +156 -0
- dstack/_internal/cli/utils/run.py +517 -0
- dstack/_internal/cli/utils/secrets.py +25 -0
- dstack/_internal/cli/utils/updates.py +98 -0
- dstack/_internal/cli/utils/volume.py +58 -0
- dstack/_internal/compat.py +3 -0
- dstack/_internal/core/backends/amddevcloud/__init__.py +1 -0
- dstack/_internal/core/backends/amddevcloud/backend.py +16 -0
- dstack/_internal/core/backends/amddevcloud/compute.py +5 -0
- dstack/_internal/core/backends/amddevcloud/configurator.py +29 -0
- dstack/_internal/core/backends/aws/auth.py +30 -0
- dstack/_internal/core/backends/aws/backend.py +31 -0
- dstack/_internal/core/backends/aws/compute.py +1153 -0
- dstack/_internal/core/backends/aws/configurator.py +191 -0
- dstack/_internal/core/backends/aws/models.py +135 -0
- dstack/_internal/core/backends/aws/resources.py +700 -0
- dstack/_internal/core/backends/azure/auth.py +39 -0
- dstack/_internal/core/backends/azure/backend.py +21 -0
- dstack/_internal/core/backends/azure/compute.py +676 -0
- dstack/_internal/core/backends/azure/configurator.py +472 -0
- dstack/_internal/core/backends/azure/models.py +98 -0
- dstack/_internal/core/backends/azure/resources.py +116 -0
- dstack/_internal/core/backends/azure/utils.py +42 -0
- dstack/_internal/core/backends/base/backend.py +18 -0
- dstack/_internal/core/backends/base/compute.py +1101 -0
- dstack/_internal/core/backends/base/configurator.py +117 -0
- dstack/_internal/core/backends/base/models.py +24 -0
- dstack/_internal/core/backends/base/offers.py +232 -0
- dstack/_internal/core/backends/cloudrift/api_client.py +220 -0
- dstack/_internal/core/backends/cloudrift/backend.py +16 -0
- dstack/_internal/core/backends/cloudrift/compute.py +138 -0
- dstack/_internal/core/backends/cloudrift/configurator.py +72 -0
- dstack/_internal/core/backends/cloudrift/models.py +40 -0
- dstack/_internal/core/backends/configurators.py +181 -0
- dstack/_internal/core/backends/cudo/__init__.py +0 -0
- dstack/_internal/core/backends/cudo/api_client.py +111 -0
- dstack/_internal/core/backends/cudo/backend.py +16 -0
- dstack/_internal/core/backends/cudo/compute.py +174 -0
- dstack/_internal/core/backends/cudo/configurator.py +63 -0
- dstack/_internal/core/backends/cudo/models.py +37 -0
- dstack/_internal/core/backends/datacrunch/__init__.py +1 -0
- dstack/_internal/core/backends/datacrunch/backend.py +18 -0
- dstack/_internal/core/backends/datacrunch/compute.py +8 -0
- dstack/_internal/core/backends/datacrunch/configurator.py +17 -0
- dstack/_internal/core/backends/digitalocean/__init__.py +1 -0
- dstack/_internal/core/backends/digitalocean/backend.py +16 -0
- dstack/_internal/core/backends/digitalocean/compute.py +5 -0
- dstack/_internal/core/backends/digitalocean/configurator.py +31 -0
- dstack/_internal/core/backends/digitalocean_base/__init__.py +1 -0
- dstack/_internal/core/backends/digitalocean_base/api_client.py +104 -0
- dstack/_internal/core/backends/digitalocean_base/backend.py +5 -0
- dstack/_internal/core/backends/digitalocean_base/compute.py +174 -0
- dstack/_internal/core/backends/digitalocean_base/configurator.py +57 -0
- dstack/_internal/core/backends/digitalocean_base/models.py +43 -0
- dstack/_internal/core/backends/dstack/__init__.py +0 -0
- dstack/_internal/core/backends/dstack/models.py +26 -0
- dstack/_internal/core/backends/features.py +74 -0
- dstack/_internal/core/backends/gcp/__init__.py +0 -0
- dstack/_internal/core/backends/gcp/auth.py +57 -0
- dstack/_internal/core/backends/gcp/backend.py +17 -0
- dstack/_internal/core/backends/gcp/compute.py +1257 -0
- dstack/_internal/core/backends/gcp/configurator.py +206 -0
- dstack/_internal/core/backends/gcp/features/__init__.py +0 -0
- dstack/_internal/core/backends/gcp/features/tcpx.py +65 -0
- dstack/_internal/core/backends/gcp/models.py +160 -0
- dstack/_internal/core/backends/gcp/resources.py +585 -0
- dstack/_internal/core/backends/hotaisle/__init__.py +1 -0
- dstack/_internal/core/backends/hotaisle/api_client.py +101 -0
- dstack/_internal/core/backends/hotaisle/backend.py +16 -0
- dstack/_internal/core/backends/hotaisle/compute.py +188 -0
- dstack/_internal/core/backends/hotaisle/configurator.py +66 -0
- dstack/_internal/core/backends/hotaisle/models.py +45 -0
- dstack/_internal/core/backends/kubernetes/__init__.py +0 -0
- dstack/_internal/core/backends/kubernetes/backend.py +16 -0
- dstack/_internal/core/backends/kubernetes/compute.py +1077 -0
- dstack/_internal/core/backends/kubernetes/configurator.py +61 -0
- dstack/_internal/core/backends/kubernetes/models.py +71 -0
- dstack/_internal/core/backends/kubernetes/utils.py +81 -0
- dstack/_internal/core/backends/lambdalabs/__init__.py +0 -0
- dstack/_internal/core/backends/lambdalabs/api_client.py +87 -0
- dstack/_internal/core/backends/lambdalabs/backend.py +17 -0
- dstack/_internal/core/backends/lambdalabs/compute.py +233 -0
- dstack/_internal/core/backends/lambdalabs/configurator.py +65 -0
- dstack/_internal/core/backends/lambdalabs/models.py +37 -0
- dstack/_internal/core/backends/local/__init__.py +0 -0
- dstack/_internal/core/backends/local/backend.py +14 -0
- dstack/_internal/core/backends/local/compute.py +130 -0
- dstack/_internal/core/backends/models.py +158 -0
- dstack/_internal/core/backends/nebius/__init__.py +0 -0
- dstack/_internal/core/backends/nebius/backend.py +16 -0
- dstack/_internal/core/backends/nebius/compute.py +401 -0
- dstack/_internal/core/backends/nebius/configurator.py +98 -0
- dstack/_internal/core/backends/nebius/models.py +185 -0
- dstack/_internal/core/backends/nebius/resources.py +433 -0
- dstack/_internal/core/backends/oci/__init__.py +0 -0
- dstack/_internal/core/backends/oci/auth.py +21 -0
- dstack/_internal/core/backends/oci/backend.py +16 -0
- dstack/_internal/core/backends/oci/compute.py +209 -0
- dstack/_internal/core/backends/oci/configurator.py +156 -0
- dstack/_internal/core/backends/oci/exceptions.py +15 -0
- dstack/_internal/core/backends/oci/models.py +87 -0
- dstack/_internal/core/backends/oci/region.py +86 -0
- dstack/_internal/core/backends/oci/resources.py +836 -0
- dstack/_internal/core/backends/runpod/__init__.py +0 -0
- dstack/_internal/core/backends/runpod/api_client.py +627 -0
- dstack/_internal/core/backends/runpod/backend.py +16 -0
- dstack/_internal/core/backends/runpod/compute.py +444 -0
- dstack/_internal/core/backends/runpod/configurator.py +63 -0
- dstack/_internal/core/backends/runpod/models.py +54 -0
- dstack/_internal/core/backends/template/__init__.py +0 -0
- dstack/_internal/core/backends/template/backend.py.jinja +16 -0
- dstack/_internal/core/backends/template/compute.py.jinja +95 -0
- dstack/_internal/core/backends/template/configurator.py.jinja +69 -0
- dstack/_internal/core/backends/template/models.py.jinja +62 -0
- dstack/_internal/core/backends/tensordock/models.py +40 -0
- dstack/_internal/core/backends/vastai/__init__.py +0 -0
- dstack/_internal/core/backends/vastai/api_client.py +143 -0
- dstack/_internal/core/backends/vastai/backend.py +16 -0
- dstack/_internal/core/backends/vastai/compute.py +141 -0
- dstack/_internal/core/backends/vastai/configurator.py +69 -0
- dstack/_internal/core/backends/vastai/models.py +37 -0
- dstack/_internal/core/backends/verda/__init__.py +0 -0
- dstack/_internal/core/backends/verda/backend.py +16 -0
- dstack/_internal/core/backends/verda/compute.py +266 -0
- dstack/_internal/core/backends/verda/configurator.py +73 -0
- dstack/_internal/core/backends/verda/models.py +38 -0
- dstack/_internal/core/backends/vultr/__init__.py +0 -0
- dstack/_internal/core/backends/vultr/api_client.py +116 -0
- dstack/_internal/core/backends/vultr/backend.py +16 -0
- dstack/_internal/core/backends/vultr/compute.py +167 -0
- dstack/_internal/core/backends/vultr/configurator.py +71 -0
- dstack/_internal/core/backends/vultr/models.py +34 -0
- dstack/_internal/core/compatibility/__init__.py +0 -0
- dstack/_internal/core/compatibility/events.py +13 -0
- dstack/_internal/core/compatibility/fleets.py +58 -0
- dstack/_internal/core/compatibility/gateways.py +39 -0
- dstack/_internal/core/compatibility/gpus.py +13 -0
- dstack/_internal/core/compatibility/logs.py +14 -0
- dstack/_internal/core/compatibility/runs.py +86 -0
- dstack/_internal/core/compatibility/volumes.py +37 -0
- dstack/_internal/core/consts.py +8 -0
- dstack/_internal/core/errors.py +160 -0
- dstack/_internal/core/models/__init__.py +0 -0
- dstack/_internal/core/models/auth.py +28 -0
- dstack/_internal/core/models/backends/__init__.py +0 -0
- dstack/_internal/core/models/backends/base.py +48 -0
- dstack/_internal/core/models/common.py +143 -0
- dstack/_internal/core/models/compute_groups.py +39 -0
- dstack/_internal/core/models/config.py +28 -0
- dstack/_internal/core/models/configurations.py +1123 -0
- dstack/_internal/core/models/envs.py +149 -0
- dstack/_internal/core/models/events.py +98 -0
- dstack/_internal/core/models/files.py +67 -0
- dstack/_internal/core/models/fleets.py +437 -0
- dstack/_internal/core/models/gateways.py +146 -0
- dstack/_internal/core/models/gpus.py +45 -0
- dstack/_internal/core/models/health.py +28 -0
- dstack/_internal/core/models/instances.py +346 -0
- dstack/_internal/core/models/logs.py +27 -0
- dstack/_internal/core/models/metrics.py +14 -0
- dstack/_internal/core/models/placement.py +27 -0
- dstack/_internal/core/models/profiles.py +431 -0
- dstack/_internal/core/models/projects.py +46 -0
- dstack/_internal/core/models/repos/__init__.py +34 -0
- dstack/_internal/core/models/repos/base.py +36 -0
- dstack/_internal/core/models/repos/local.py +96 -0
- dstack/_internal/core/models/repos/remote.py +341 -0
- dstack/_internal/core/models/repos/virtual.py +85 -0
- dstack/_internal/core/models/resources.py +424 -0
- dstack/_internal/core/models/routers.py +24 -0
- dstack/_internal/core/models/runs.py +618 -0
- dstack/_internal/core/models/secrets.py +16 -0
- dstack/_internal/core/models/server.py +7 -0
- dstack/_internal/core/models/services.py +76 -0
- dstack/_internal/core/models/unix.py +53 -0
- dstack/_internal/core/models/users.py +60 -0
- dstack/_internal/core/models/volumes.py +221 -0
- dstack/_internal/core/services/__init__.py +16 -0
- dstack/_internal/core/services/api_client.py +15 -0
- dstack/_internal/core/services/configs/__init__.py +116 -0
- dstack/_internal/core/services/diff.py +71 -0
- dstack/_internal/core/services/logs.py +58 -0
- dstack/_internal/core/services/profiles.py +46 -0
- dstack/_internal/core/services/repos.py +236 -0
- dstack/_internal/core/services/ssh/__init__.py +27 -0
- dstack/_internal/core/services/ssh/attach.py +241 -0
- dstack/_internal/core/services/ssh/client.py +113 -0
- dstack/_internal/core/services/ssh/key_manager.py +53 -0
- dstack/_internal/core/services/ssh/ports.py +89 -0
- dstack/_internal/core/services/ssh/tunnel.py +337 -0
- dstack/_internal/proxy/__init__.py +8 -0
- dstack/_internal/proxy/gateway/__init__.py +0 -0
- dstack/_internal/proxy/gateway/app.py +89 -0
- dstack/_internal/proxy/gateway/auth.py +26 -0
- dstack/_internal/proxy/gateway/const.py +7 -0
- dstack/_internal/proxy/gateway/deps.py +73 -0
- dstack/_internal/proxy/gateway/main.py +17 -0
- dstack/_internal/proxy/gateway/models.py +23 -0
- dstack/_internal/proxy/gateway/repo/__init__.py +0 -0
- dstack/_internal/proxy/gateway/repo/repo.py +121 -0
- dstack/_internal/proxy/gateway/repo/state_v1.py +164 -0
- dstack/_internal/proxy/gateway/resources/nginx/00-log-format.conf +11 -0
- dstack/_internal/proxy/gateway/resources/nginx/entrypoint.jinja2 +27 -0
- dstack/_internal/proxy/gateway/resources/nginx/router_workers.jinja2 +23 -0
- dstack/_internal/proxy/gateway/resources/nginx/service.jinja2 +105 -0
- dstack/_internal/proxy/gateway/routers/__init__.py +0 -0
- dstack/_internal/proxy/gateway/routers/auth.py +10 -0
- dstack/_internal/proxy/gateway/routers/config.py +28 -0
- dstack/_internal/proxy/gateway/routers/registry.py +124 -0
- dstack/_internal/proxy/gateway/routers/stats.py +18 -0
- dstack/_internal/proxy/gateway/schemas/__init__.py +0 -0
- dstack/_internal/proxy/gateway/schemas/common.py +5 -0
- dstack/_internal/proxy/gateway/schemas/config.py +9 -0
- dstack/_internal/proxy/gateway/schemas/registry.py +63 -0
- dstack/_internal/proxy/gateway/schemas/stats.py +15 -0
- dstack/_internal/proxy/gateway/services/__init__.py +0 -0
- dstack/_internal/proxy/gateway/services/model_routers/__init__.py +18 -0
- dstack/_internal/proxy/gateway/services/model_routers/base.py +91 -0
- dstack/_internal/proxy/gateway/services/model_routers/sglang.py +269 -0
- dstack/_internal/proxy/gateway/services/nginx.py +455 -0
- dstack/_internal/proxy/gateway/services/registry.py +426 -0
- dstack/_internal/proxy/gateway/services/server_client.py +95 -0
- dstack/_internal/proxy/gateway/services/stats.py +170 -0
- dstack/_internal/proxy/gateway/testing/__init__.py +0 -0
- dstack/_internal/proxy/gateway/testing/common.py +13 -0
- dstack/_internal/proxy/lib/__init__.py +0 -0
- dstack/_internal/proxy/lib/auth.py +7 -0
- dstack/_internal/proxy/lib/deps.py +106 -0
- dstack/_internal/proxy/lib/errors.py +14 -0
- dstack/_internal/proxy/lib/models.py +112 -0
- dstack/_internal/proxy/lib/repo.py +27 -0
- dstack/_internal/proxy/lib/routers/__init__.py +0 -0
- dstack/_internal/proxy/lib/routers/model_proxy.py +102 -0
- dstack/_internal/proxy/lib/schemas/__init__.py +0 -0
- dstack/_internal/proxy/lib/schemas/model_proxy.py +77 -0
- dstack/_internal/proxy/lib/services/__init__.py +0 -0
- dstack/_internal/proxy/lib/services/model_proxy/__init__.py +0 -0
- dstack/_internal/proxy/lib/services/model_proxy/clients/__init__.py +0 -0
- dstack/_internal/proxy/lib/services/model_proxy/clients/base.py +18 -0
- dstack/_internal/proxy/lib/services/model_proxy/clients/openai.py +67 -0
- dstack/_internal/proxy/lib/services/model_proxy/clients/tgi.py +208 -0
- dstack/_internal/proxy/lib/services/model_proxy/model_proxy.py +23 -0
- dstack/_internal/proxy/lib/services/service_connection.py +160 -0
- dstack/_internal/proxy/lib/testing/__init__.py +0 -0
- dstack/_internal/proxy/lib/testing/auth.py +11 -0
- dstack/_internal/proxy/lib/testing/common.py +51 -0
- dstack/_internal/server/__init__.py +0 -0
- dstack/_internal/server/alembic.ini +100 -0
- dstack/_internal/server/app.py +432 -0
- dstack/_internal/server/background/__init__.py +142 -0
- dstack/_internal/server/background/tasks/__init__.py +0 -0
- dstack/_internal/server/background/tasks/common.py +24 -0
- dstack/_internal/server/background/tasks/process_compute_groups.py +167 -0
- dstack/_internal/server/background/tasks/process_events.py +17 -0
- dstack/_internal/server/background/tasks/process_fleets.py +289 -0
- dstack/_internal/server/background/tasks/process_gateways.py +188 -0
- dstack/_internal/server/background/tasks/process_idle_volumes.py +145 -0
- dstack/_internal/server/background/tasks/process_instances.py +1186 -0
- dstack/_internal/server/background/tasks/process_metrics.py +172 -0
- dstack/_internal/server/background/tasks/process_placement_groups.py +104 -0
- dstack/_internal/server/background/tasks/process_probes.py +164 -0
- dstack/_internal/server/background/tasks/process_prometheus_metrics.py +150 -0
- dstack/_internal/server/background/tasks/process_running_jobs.py +1238 -0
- dstack/_internal/server/background/tasks/process_runs.py +842 -0
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +1106 -0
- dstack/_internal/server/background/tasks/process_terminating_jobs.py +108 -0
- dstack/_internal/server/background/tasks/process_volumes.py +129 -0
- dstack/_internal/server/compatibility/__init__.py +0 -0
- dstack/_internal/server/compatibility/common.py +20 -0
- dstack/_internal/server/compatibility/gpus.py +22 -0
- dstack/_internal/server/db.py +127 -0
- dstack/_internal/server/deps.py +19 -0
- dstack/_internal/server/main.py +4 -0
- dstack/_internal/server/migrations/__init__.py +0 -0
- dstack/_internal/server/migrations/env.py +112 -0
- dstack/_internal/server/migrations/script.py.mako +28 -0
- dstack/_internal/server/migrations/versions/006512f572b4_add_projects_original_name.py +38 -0
- dstack/_internal/server/migrations/versions/065588ec72b8_add_vultr_to_backendtype_enum.py +81 -0
- dstack/_internal/server/migrations/versions/06e977bc61c7_add_usermodel_deleted_and_original_name.py +45 -0
- dstack/_internal/server/migrations/versions/0e33559e16ed_update_instancestatus.py +64 -0
- dstack/_internal/server/migrations/versions/112753bc17dd_remove_nullable_fields.py +50 -0
- dstack/_internal/server/migrations/versions/1338b788b612_reverse_job_instance_relationship.py +71 -0
- dstack/_internal/server/migrations/versions/14f2cb002fc2_add_jobmodel_removed_flag.py +44 -0
- dstack/_internal/server/migrations/versions/1a48dfe44a40_rework_termination_handling.py +42 -0
- dstack/_internal/server/migrations/versions/1aa9638ad963_added_email_index.py +31 -0
- dstack/_internal/server/migrations/versions/1e3fb39ef74b_add_remote_connection_details.py +26 -0
- dstack/_internal/server/migrations/versions/1e76fb0dde87_add_jobmodel_inactivity_secs.py +32 -0
- dstack/_internal/server/migrations/versions/20166748b60c_add_jobmodel_disconnected_at.py +100 -0
- dstack/_internal/server/migrations/versions/22d74df9897e_add_events_and_event_targets.py +99 -0
- dstack/_internal/server/migrations/versions/23e01c56279a_make_blob_nullable.py +32 -0
- dstack/_internal/server/migrations/versions/2498ab323443_add_fleetmodel_consolidation_attempt_.py +44 -0
- dstack/_internal/server/migrations/versions/252d3743b641_.py +40 -0
- dstack/_internal/server/migrations/versions/25479f540245_add_probes.py +43 -0
- dstack/_internal/server/migrations/versions/27d3e55759fa_add_pools.py +152 -0
- dstack/_internal/server/migrations/versions/29826f417010_remove_instancemodel_retry_policy.py +34 -0
- dstack/_internal/server/migrations/versions/29c08c6a8cb3_.py +36 -0
- dstack/_internal/server/migrations/versions/35e90e1b0d3e_add_rolling_deployment_fields.py +42 -0
- dstack/_internal/server/migrations/versions/35f732ee4cf5_add_projectmodel_is_public.py +39 -0
- dstack/_internal/server/migrations/versions/3cf77fb8bcf1_store_repo_clone_url.py +85 -0
- dstack/_internal/server/migrations/versions/3d7f6c2ec000_add_jobmodel_registered.py +28 -0
- dstack/_internal/server/migrations/versions/3dbdce90d0e0_fix_code_uq_constraint.py +33 -0
- dstack/_internal/server/migrations/versions/48ad3ecbaea2_do_not_delete_projects_and_runs.py +46 -0
- dstack/_internal/server/migrations/versions/4ae1a5b0e7f1_add_run_list_index.py +34 -0
- dstack/_internal/server/migrations/versions/4b4319398164_introduce_runs_processing.py +144 -0
- dstack/_internal/server/migrations/versions/50dd7ea98639_index_status_columns.py +55 -0
- dstack/_internal/server/migrations/versions/51d45659d574_add_instancemodel_blocks_fields.py +43 -0
- dstack/_internal/server/migrations/versions/54a77e19c64c_add_manager_project_role.py +67 -0
- dstack/_internal/server/migrations/versions/555138b1f77f_change_instancemodel_for_asynchronous_.py +61 -0
- dstack/_internal/server/migrations/versions/58aa5162dcc3_add_gatewaymodel_configuration.py +32 -0
- dstack/_internal/server/migrations/versions/5ad8debc8fe6_fixes_for_psql.py +329 -0
- dstack/_internal/server/migrations/versions/5ec538b70e71_replace_instansestatus.py +31 -0
- dstack/_internal/server/migrations/versions/5f1707c525d2_add_filearchivemodel.py +39 -0
- dstack/_internal/server/migrations/versions/5fd659afca82_add_ix_instances_fleet_id.py +31 -0
- dstack/_internal/server/migrations/versions/60e444118b6d_add_jobprometheusmetrics.py +40 -0
- dstack/_internal/server/migrations/versions/63c3f19cb184_add_jobterminationreason_inactivity_.py +83 -0
- dstack/_internal/server/migrations/versions/644b8a114187_add_secretmodel.py +49 -0
- dstack/_internal/server/migrations/versions/686fb8341ea5_add_user_emails.py +32 -0
- dstack/_internal/server/migrations/versions/6c1a9d6530ee_add_jobmodel_exit_status.py +26 -0
- dstack/_internal/server/migrations/versions/706e0acc3a7d_add_runmodel_desired_replica_counts.py +26 -0
- dstack/_internal/server/migrations/versions/710e5b3fac8f_add_encryption.py +54 -0
- dstack/_internal/server/migrations/versions/728b1488b1b4_add_instance_health.py +50 -0
- dstack/_internal/server/migrations/versions/74a1f55209bd_store_enums_as_strings.py +484 -0
- dstack/_internal/server/migrations/versions/7b24b1c8eba7_add_instancemodel_last_processed_at.py +68 -0
- dstack/_internal/server/migrations/versions/7ba3b59d7ca6_add_runmodel_resubmission_attempt.py +35 -0
- dstack/_internal/server/migrations/versions/7bc2586e8b9e_make_instancemodel_pool_id_optional.py +36 -0
- dstack/_internal/server/migrations/versions/7d1ec2b920ac_add_computegroupmodel.py +91 -0
- dstack/_internal/server/migrations/versions/803c7e9ed85d_add_jobmodel_job_runtime_data.py +32 -0
- dstack/_internal/server/migrations/versions/82b32a135ea2_.py +58 -0
- dstack/_internal/server/migrations/versions/866ec1d67184_replace_retrypolicy_limit_with_.py +93 -0
- dstack/_internal/server/migrations/versions/903c91e24634_add_instances_termination_reason_message.py +34 -0
- dstack/_internal/server/migrations/versions/91a12fff6c76_add_repocredsmodel.py +43 -0
- dstack/_internal/server/migrations/versions/91ac5e543037_extend_repos_creds_column.py +36 -0
- dstack/_internal/server/migrations/versions/98cd9c8b5927_add_volumemodel.py +73 -0
- dstack/_internal/server/migrations/versions/98d1b92988bc_add_jobterminationreason_terminated_due_.py +140 -0
- dstack/_internal/server/migrations/versions/99b4c8c954ea_add_termination_reason_message.py +71 -0
- dstack/_internal/server/migrations/versions/9eea6af28e10_added_fail_reason_for_instancemodel.py +36 -0
- dstack/_internal/server/migrations/versions/__init__.py +0 -0
- dstack/_internal/server/migrations/versions/a060e2440936_.py +206 -0
- dstack/_internal/server/migrations/versions/a751ef183f27_move_attachment_data_to_volumes_.py +34 -0
- dstack/_internal/server/migrations/versions/a7b46c073fa1_add_placementgroupmodel.py +58 -0
- dstack/_internal/server/migrations/versions/afbc600ff2b2_add_created_at_to_usermodel_and_.py +102 -0
- dstack/_internal/server/migrations/versions/b4d6ad60db08_add_instancemodel_unreachable.py +37 -0
- dstack/_internal/server/migrations/versions/b88d55c2a07d_replace_instancestatus_ready.py +21 -0
- dstack/_internal/server/migrations/versions/bc8ca4a505c6_store_backendtype_as_string.py +171 -0
- dstack/_internal/server/migrations/versions/bca2fdf130bf_add_runmodel_priority.py +34 -0
- dstack/_internal/server/migrations/versions/bfba43f6def2_.py +32 -0
- dstack/_internal/server/migrations/versions/c00090eaef21_support_fleets.py +108 -0
- dstack/_internal/server/migrations/versions/c154eece89da_add_fields_for_async_gateway_creation.py +74 -0
- dstack/_internal/server/migrations/versions/c20626d03cfb_add_jobmetricspoint.py +43 -0
- dstack/_internal/server/migrations/versions/c48df7985d57_add_instance_termination_retries.py +38 -0
- dstack/_internal/server/migrations/versions/c83d45f9a971_replace_string_with_text.py +150 -0
- dstack/_internal/server/migrations/versions/d0bb68e48b9f_add_project_owners_and_quotas.py +106 -0
- dstack/_internal/server/migrations/versions/d3e8af4786fa_gateway_compute_flag_deleted.py +34 -0
- dstack/_internal/server/migrations/versions/d4d9dc26cf58_add_ix_jobs_run_id.py +31 -0
- dstack/_internal/server/migrations/versions/d5863798bf41_add_volumemodel_last_job_processed_at.py +40 -0
- dstack/_internal/server/migrations/versions/d6b11105f659_add_usermodel_active.py +36 -0
- dstack/_internal/server/migrations/versions/da574e93fee0_add_jobmodel_volumes_detached_at.py +40 -0
- dstack/_internal/server/migrations/versions/dfffd6a1165c_add_fields_for_gateways_behind_alb.py +36 -0
- dstack/_internal/server/migrations/versions/e2d08cd1b8d9_add_jobmodel_fleet.py +41 -0
- dstack/_internal/server/migrations/versions/e3b7db07727f_add_gatewaycomputemodel_app_updated_at.py +61 -0
- dstack/_internal/server/migrations/versions/e6391ca6a264_separate_gateways_from_compute.py +72 -0
- dstack/_internal/server/migrations/versions/ea60480f82bb_add_membermodel_member_num.py +32 -0
- dstack/_internal/server/migrations/versions/ec02a26a256c_add_runmodel_next_triggered_at.py +38 -0
- dstack/_internal/server/migrations/versions/ed0ca30e13bb_migrate_instancestatus_provisioning.py +29 -0
- dstack/_internal/server/migrations/versions/fe72c4de8376_add_gateways.py +81 -0
- dstack/_internal/server/migrations/versions/ff1d94f65b08_user_ssh_key.py +34 -0
- dstack/_internal/server/migrations/versions/ffa99edd1988_add_jobterminationreason_max_duration_.py +81 -0
- dstack/_internal/server/models.py +930 -0
- dstack/_internal/server/routers/__init__.py +0 -0
- dstack/_internal/server/routers/auth.py +34 -0
- dstack/_internal/server/routers/backends.py +142 -0
- dstack/_internal/server/routers/events.py +60 -0
- dstack/_internal/server/routers/files.py +68 -0
- dstack/_internal/server/routers/fleets.py +202 -0
- dstack/_internal/server/routers/gateways.py +109 -0
- dstack/_internal/server/routers/gpus.py +32 -0
- dstack/_internal/server/routers/instances.py +77 -0
- dstack/_internal/server/routers/logs.py +34 -0
- dstack/_internal/server/routers/metrics.py +82 -0
- dstack/_internal/server/routers/projects.py +205 -0
- dstack/_internal/server/routers/prometheus.py +35 -0
- dstack/_internal/server/routers/repos.py +118 -0
- dstack/_internal/server/routers/runs.py +216 -0
- dstack/_internal/server/routers/secrets.py +86 -0
- dstack/_internal/server/routers/server.py +19 -0
- dstack/_internal/server/routers/users.py +158 -0
- dstack/_internal/server/routers/volumes.py +122 -0
- dstack/_internal/server/schemas/__init__.py +0 -0
- dstack/_internal/server/schemas/auth.py +83 -0
- dstack/_internal/server/schemas/backends.py +16 -0
- dstack/_internal/server/schemas/common.py +9 -0
- dstack/_internal/server/schemas/events.py +211 -0
- dstack/_internal/server/schemas/files.py +5 -0
- dstack/_internal/server/schemas/fleets.py +49 -0
- dstack/_internal/server/schemas/gateways.py +31 -0
- dstack/_internal/server/schemas/gpus.py +26 -0
- dstack/_internal/server/schemas/health/__init__.py +0 -0
- dstack/_internal/server/schemas/health/dcgm.py +56 -0
- dstack/_internal/server/schemas/instances.py +47 -0
- dstack/_internal/server/schemas/logs.py +17 -0
- dstack/_internal/server/schemas/projects.py +81 -0
- dstack/_internal/server/schemas/repos.py +24 -0
- dstack/_internal/server/schemas/runner.py +269 -0
- dstack/_internal/server/schemas/runs.py +66 -0
- dstack/_internal/server/schemas/secrets.py +16 -0
- dstack/_internal/server/schemas/users.py +72 -0
- dstack/_internal/server/schemas/volumes.py +29 -0
- dstack/_internal/server/security/__init__.py +0 -0
- dstack/_internal/server/security/permissions.py +251 -0
- dstack/_internal/server/services/__init__.py +0 -0
- dstack/_internal/server/services/auth.py +77 -0
- dstack/_internal/server/services/backends/__init__.py +404 -0
- dstack/_internal/server/services/backends/handlers.py +105 -0
- dstack/_internal/server/services/compute_groups.py +22 -0
- dstack/_internal/server/services/config.py +279 -0
- dstack/_internal/server/services/docker.py +162 -0
- dstack/_internal/server/services/encryption/__init__.py +102 -0
- dstack/_internal/server/services/encryption/keys/__init__.py +0 -0
- dstack/_internal/server/services/encryption/keys/aes.py +68 -0
- dstack/_internal/server/services/encryption/keys/base.py +19 -0
- dstack/_internal/server/services/encryption/keys/identity.py +28 -0
- dstack/_internal/server/services/events.py +477 -0
- dstack/_internal/server/services/files.py +91 -0
- dstack/_internal/server/services/fleets.py +1224 -0
- dstack/_internal/server/services/gateways/__init__.py +686 -0
- dstack/_internal/server/services/gateways/client.py +209 -0
- dstack/_internal/server/services/gateways/connection.py +139 -0
- dstack/_internal/server/services/gateways/pool.py +58 -0
- dstack/_internal/server/services/gpus.py +387 -0
- dstack/_internal/server/services/instances.py +731 -0
- dstack/_internal/server/services/jobs/__init__.py +840 -0
- dstack/_internal/server/services/jobs/configurators/__init__.py +0 -0
- dstack/_internal/server/services/jobs/configurators/base.py +469 -0
- dstack/_internal/server/services/jobs/configurators/dev.py +69 -0
- dstack/_internal/server/services/jobs/configurators/extensions/__init__.py +0 -0
- dstack/_internal/server/services/jobs/configurators/extensions/base.py +15 -0
- dstack/_internal/server/services/jobs/configurators/extensions/cursor.py +42 -0
- dstack/_internal/server/services/jobs/configurators/extensions/vscode.py +42 -0
- dstack/_internal/server/services/jobs/configurators/extensions/windsurf.py +43 -0
- dstack/_internal/server/services/jobs/configurators/service.py +28 -0
- dstack/_internal/server/services/jobs/configurators/task.py +39 -0
- dstack/_internal/server/services/locking.py +187 -0
- dstack/_internal/server/services/logging.py +29 -0
- dstack/_internal/server/services/logs/__init__.py +122 -0
- dstack/_internal/server/services/logs/aws.py +373 -0
- dstack/_internal/server/services/logs/base.py +47 -0
- dstack/_internal/server/services/logs/filelog.py +261 -0
- dstack/_internal/server/services/logs/fluentbit.py +329 -0
- dstack/_internal/server/services/logs/gcp.py +181 -0
- dstack/_internal/server/services/metrics.py +172 -0
- dstack/_internal/server/services/offers.py +249 -0
- dstack/_internal/server/services/permissions.py +37 -0
- dstack/_internal/server/services/placement.py +234 -0
- dstack/_internal/server/services/plugins.py +109 -0
- dstack/_internal/server/services/probes.py +10 -0
- dstack/_internal/server/services/projects.py +835 -0
- dstack/_internal/server/services/prometheus/__init__.py +0 -0
- dstack/_internal/server/services/prometheus/client_metrics.py +55 -0
- dstack/_internal/server/services/prometheus/custom_metrics.py +327 -0
- dstack/_internal/server/services/proxy/__init__.py +3 -0
- dstack/_internal/server/services/proxy/auth.py +12 -0
- dstack/_internal/server/services/proxy/deps.py +18 -0
- dstack/_internal/server/services/proxy/repo.py +189 -0
- dstack/_internal/server/services/proxy/routers/__init__.py +0 -0
- dstack/_internal/server/services/proxy/routers/service_proxy.py +49 -0
- dstack/_internal/server/services/proxy/services/__init__.py +0 -0
- dstack/_internal/server/services/proxy/services/service_proxy.py +135 -0
- dstack/_internal/server/services/repos.py +362 -0
- dstack/_internal/server/services/requirements/__init__.py +0 -0
- dstack/_internal/server/services/requirements/combine.py +260 -0
- dstack/_internal/server/services/resources.py +21 -0
- dstack/_internal/server/services/runner/__init__.py +0 -0
- dstack/_internal/server/services/runner/client.py +646 -0
- dstack/_internal/server/services/runner/ssh.py +128 -0
- dstack/_internal/server/services/runs/__init__.py +1026 -0
- dstack/_internal/server/services/runs/plan.py +703 -0
- dstack/_internal/server/services/runs/replicas.py +317 -0
- dstack/_internal/server/services/runs/spec.py +191 -0
- dstack/_internal/server/services/secrets.py +245 -0
- dstack/_internal/server/services/services/__init__.py +345 -0
- dstack/_internal/server/services/services/autoscalers.py +140 -0
- dstack/_internal/server/services/services/options.py +53 -0
- dstack/_internal/server/services/ssh.py +67 -0
- dstack/_internal/server/services/storage/__init__.py +37 -0
- dstack/_internal/server/services/storage/base.py +48 -0
- dstack/_internal/server/services/storage/gcs.py +66 -0
- dstack/_internal/server/services/storage/s3.py +69 -0
- dstack/_internal/server/services/users.py +461 -0
- dstack/_internal/server/services/volumes.py +496 -0
- dstack/_internal/server/settings.py +161 -0
- dstack/_internal/server/statics/00a6e1fb461ed2929fb9.png +0 -0
- dstack/_internal/server/statics/0cae4d9f0a36034984a7.png +0 -0
- dstack/_internal/server/statics/391de232cc0e30cae513.png +0 -0
- dstack/_internal/server/statics/4e0eead8c1a73689ef9d.svg +1 -0
- dstack/_internal/server/statics/544afa2f63428c2235b0.png +0 -0
- dstack/_internal/server/statics/54a4f50f74c6b9381530.svg +7 -0
- dstack/_internal/server/statics/68dd1360a7d2611e0132.svg +4 -0
- dstack/_internal/server/statics/69544b4c81973b54a66f.png +0 -0
- dstack/_internal/server/statics/77a8b02b17af19e39266.png +0 -0
- dstack/_internal/server/statics/83a93a8871c219104367.svg +9 -0
- dstack/_internal/server/statics/8f28bb8e9999e5e6a48b.svg +4 -0
- dstack/_internal/server/statics/9124086961ab8c366bc4.svg +9 -0
- dstack/_internal/server/statics/9a9ebaeb54b025dbac0a.svg +5 -0
- dstack/_internal/server/statics/a3428392dc534f3b15c4.svg +7 -0
- dstack/_internal/server/statics/ae22625574d69361f72c.png +0 -0
- dstack/_internal/server/statics/assets/android-chrome-144x144.png +0 -0
- dstack/_internal/server/statics/assets/android-chrome-192x192.png +0 -0
- dstack/_internal/server/statics/assets/android-chrome-256x256.png +0 -0
- dstack/_internal/server/statics/assets/android-chrome-36x36.png +0 -0
- dstack/_internal/server/statics/assets/android-chrome-384x384.png +0 -0
- dstack/_internal/server/statics/assets/android-chrome-48x48.png +0 -0
- dstack/_internal/server/statics/assets/android-chrome-512x512.png +0 -0
- dstack/_internal/server/statics/assets/android-chrome-72x72.png +0 -0
- dstack/_internal/server/statics/assets/android-chrome-96x96.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-icon-1024x1024.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-icon-114x114.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-icon-120x120.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-icon-144x144.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-icon-152x152.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-icon-167x167.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-icon-180x180.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-icon-57x57.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-icon-60x60.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-icon-72x72.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-icon-76x76.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-icon-precomposed.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-icon.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-1125x2436.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-1136x640.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-1170x2532.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-1179x2556.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-1242x2208.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-1242x2688.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-1284x2778.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-1290x2796.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-1334x750.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-1488x2266.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-1536x2048.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-1620x2160.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-1640x2160.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-1668x2224.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-1668x2388.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-1792x828.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-2048x1536.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-2048x2732.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-2160x1620.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-2160x1640.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-2208x1242.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-2224x1668.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-2266x1488.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-2388x1668.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-2436x1125.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-2532x1170.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-2556x1179.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-2688x1242.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-2732x2048.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-2778x1284.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-2796x1290.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-640x1136.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-750x1334.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-828x1792.png +0 -0
- dstack/_internal/server/statics/assets/browserconfig.xml +12 -0
- dstack/_internal/server/statics/assets/favicon-16x16.png +0 -0
- dstack/_internal/server/statics/assets/favicon-32x32.png +0 -0
- dstack/_internal/server/statics/assets/favicon-48x48.png +0 -0
- dstack/_internal/server/statics/assets/favicon.ico +0 -0
- dstack/{dashboard/statics/assets/manifest.json → _internal/server/statics/assets/manifest.webmanifest} +18 -9
- dstack/_internal/server/statics/assets/mstile-144x144.png +0 -0
- dstack/_internal/server/statics/assets/mstile-150x150.png +0 -0
- dstack/_internal/server/statics/assets/mstile-310x150.png +0 -0
- dstack/_internal/server/statics/assets/mstile-310x310.png +0 -0
- dstack/_internal/server/statics/assets/mstile-70x70.png +0 -0
- dstack/_internal/server/statics/assets/yandex-browser-50x50.png +0 -0
- dstack/_internal/server/statics/b7ae68f44193474fc578.png +0 -0
- dstack/_internal/server/statics/d2f008c75b2b5b191f3f.png +0 -0
- dstack/_internal/server/statics/d44c33e1b92e05c379fd.png +0 -0
- dstack/_internal/server/statics/dd43ff0552815179d7ab.png +0 -0
- dstack/_internal/server/statics/dd4e7166c0b9aac197d7.png +0 -0
- dstack/_internal/server/statics/e30b27916930d43d2271.png +0 -0
- dstack/_internal/server/statics/e467d7d60aae81ab198b.svg +6 -0
- dstack/_internal/server/statics/eb9b344b73818fe2b71a.png +0 -0
- dstack/_internal/server/statics/f517dd626eb964120de0.png +0 -0
- dstack/_internal/server/statics/f958aecddee5d8e3222c.png +0 -0
- dstack/_internal/server/statics/index.html +3 -0
- dstack/_internal/server/statics/logo-notext.svg +116 -0
- dstack/_internal/server/statics/main-2e6967bad9f29395eea6.css +3 -0
- dstack/_internal/server/statics/main-7dc0f6d20b8b41659acc.js +155547 -0
- dstack/_internal/server/statics/main-7dc0f6d20b8b41659acc.js.map +1 -0
- dstack/{dashboard → _internal/server}/statics/manifest.json +2 -2
- dstack/_internal/server/statics/static/media/entraID.d65d1f3e9486a8e56d24fc07b3230885.svg +9 -0
- dstack/_internal/server/statics/static/media/google.b194b06fafd0a52aeb566922160ea514.svg +1 -0
- dstack/{dashboard/statics/static/media/logo.f9d7170678f68f796e270698633770ec.svg → _internal/server/statics/static/media/logo.f602feeb138844eda97c8cb641461448.svg} +8 -6
- dstack/_internal/server/statics/static/media/okta.12f178e6873a1100965f2a4dbd18fcec.svg +2 -0
- dstack/_internal/server/statics/static/media/theme.3994c817bb7dda191c1c9640dee0bf42.svg +3 -0
- dstack/_internal/server/testing/__init__.py +0 -0
- dstack/_internal/server/testing/common.py +1220 -0
- dstack/_internal/server/testing/conf.py +53 -0
- dstack/_internal/server/testing/matchers.py +31 -0
- dstack/_internal/server/utils/__init__.py +0 -0
- dstack/_internal/server/utils/common.py +55 -0
- dstack/_internal/server/utils/logging.py +51 -0
- dstack/_internal/server/utils/provisioning.py +368 -0
- dstack/_internal/server/utils/routers.py +166 -0
- dstack/_internal/server/utils/sentry_utils.py +24 -0
- dstack/_internal/settings.py +49 -0
- dstack/_internal/utils/__init__.py +0 -0
- dstack/_internal/utils/common.py +318 -0
- dstack/_internal/utils/cron.py +5 -0
- dstack/_internal/utils/crypto.py +40 -0
- dstack/_internal/utils/env.py +88 -0
- dstack/_internal/utils/event_loop.py +30 -0
- dstack/_internal/utils/files.py +69 -0
- dstack/_internal/utils/gpu.py +59 -0
- dstack/_internal/utils/hash.py +31 -0
- dstack/_internal/utils/interpolator.py +91 -0
- dstack/_internal/utils/json_schema.py +11 -0
- dstack/_internal/utils/json_utils.py +54 -0
- dstack/_internal/utils/logging.py +5 -0
- dstack/_internal/utils/nested_list.py +47 -0
- dstack/_internal/utils/network.py +50 -0
- dstack/_internal/utils/path.py +57 -0
- dstack/_internal/utils/random_names.py +258 -0
- dstack/_internal/utils/ssh.py +346 -0
- dstack/_internal/utils/tags.py +42 -0
- dstack/_internal/utils/typing.py +14 -0
- dstack/_internal/utils/version.py +22 -0
- dstack/api/__init__.py +46 -0
- dstack/api/_public/__init__.py +96 -0
- dstack/api/_public/backends.py +42 -0
- dstack/api/_public/common.py +5 -0
- dstack/api/_public/repos.py +202 -0
- dstack/api/_public/runs.py +714 -0
- dstack/api/server/__init__.py +206 -0
- dstack/api/server/_auth.py +30 -0
- dstack/api/server/_backends.py +38 -0
- dstack/api/server/_events.py +64 -0
- dstack/api/server/_files.py +18 -0
- dstack/api/server/_fleets.py +82 -0
- dstack/api/server/_gateways.py +54 -0
- dstack/api/server/_gpus.py +27 -0
- dstack/api/server/_group.py +22 -0
- dstack/api/server/_logs.py +15 -0
- dstack/api/server/_metrics.py +23 -0
- dstack/api/server/_projects.py +124 -0
- dstack/api/server/_repos.py +64 -0
- dstack/api/server/_runs.py +102 -0
- dstack/api/server/_secrets.py +36 -0
- dstack/api/server/_users.py +82 -0
- dstack/api/server/_volumes.py +39 -0
- dstack/api/server/utils.py +34 -0
- dstack/api/utils.py +105 -0
- dstack/core/__init__.py +0 -0
- dstack/plugins/__init__.py +8 -0
- dstack/plugins/_base.py +72 -0
- dstack/plugins/_models.py +8 -0
- dstack/plugins/_utils.py +19 -0
- dstack/plugins/builtin/__init__.py +0 -0
- dstack/plugins/builtin/rest_plugin/__init__.py +18 -0
- dstack/plugins/builtin/rest_plugin/_models.py +48 -0
- dstack/plugins/builtin/rest_plugin/_plugin.py +147 -0
- dstack/version.py +3 -1
- dstack-0.20.7.dist-info/METADATA +519 -0
- dstack-0.20.7.dist-info/RECORD +720 -0
- {dstack-0.0.9.dist-info → dstack-0.20.7.dist-info}/WHEEL +1 -2
- dstack-0.20.7.dist-info/entry_points.txt +2 -0
- dstack-0.20.7.dist-info/licenses/LICENSE.md +353 -0
- dstack/aws/__init__.py +0 -180
- dstack/aws/artifacts.py +0 -111
- dstack/aws/config.py +0 -40
- dstack/aws/jobs.py +0 -245
- dstack/aws/logs.py +0 -186
- dstack/aws/repos.py +0 -137
- dstack/aws/run_names.py +0 -17
- dstack/aws/runners.py +0 -693
- dstack/aws/runs.py +0 -79
- dstack/aws/secrets.py +0 -99
- dstack/aws/tags.py +0 -138
- dstack/backend.py +0 -299
- dstack/cli/app.py +0 -41
- dstack/cli/artifacts.py +0 -87
- dstack/cli/common.py +0 -57
- dstack/cli/config.py +0 -194
- dstack/cli/dashboard.py +0 -26
- dstack/cli/delete.py +0 -49
- dstack/cli/init.py +0 -33
- dstack/cli/logs.py +0 -87
- dstack/cli/main.py +0 -81
- dstack/cli/restart.py +0 -43
- dstack/cli/run.py +0 -223
- dstack/cli/schema.py +0 -46
- dstack/cli/secrets.py +0 -97
- dstack/cli/status.py +0 -140
- dstack/cli/stop.py +0 -53
- dstack/cli/tags.py +0 -100
- dstack/config.py +0 -80
- dstack/dashboard/artifacts.py +0 -26
- dstack/dashboard/logs.py +0 -73
- dstack/dashboard/main.py +0 -45
- dstack/dashboard/repos.py +0 -41
- dstack/dashboard/runs.py +0 -140
- dstack/dashboard/secrets.py +0 -53
- dstack/dashboard/statics/4d6a4e032505c1efd23c.png +0 -0
- dstack/dashboard/statics/7e018c3e5566d7c349a8.png +0 -0
- dstack/dashboard/statics/assets/android-chrome-144x144.png +0 -0
- dstack/dashboard/statics/assets/android-chrome-192x192.png +0 -0
- dstack/dashboard/statics/assets/android-chrome-256x256.png +0 -0
- dstack/dashboard/statics/assets/android-chrome-36x36.png +0 -0
- dstack/dashboard/statics/assets/android-chrome-384x384.png +0 -0
- dstack/dashboard/statics/assets/android-chrome-48x48.png +0 -0
- dstack/dashboard/statics/assets/android-chrome-512x512.png +0 -0
- dstack/dashboard/statics/assets/android-chrome-72x72.png +0 -0
- dstack/dashboard/statics/assets/android-chrome-96x96.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-icon-1024x1024.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-icon-114x114.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-icon-120x120.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-icon-144x144.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-icon-152x152.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-icon-167x167.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-icon-180x180.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-icon-57x57.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-icon-60x60.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-icon-72x72.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-icon-76x76.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-icon-precomposed.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-icon.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-1125x2436.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-1136x640.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-1242x2208.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-1242x2688.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-1334x750.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-1536x2048.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-1620x2160.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-1668x2224.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-1668x2388.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-1792x828.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-2048x1536.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-2048x2732.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-2160x1620.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-2208x1242.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-2224x1668.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-2388x1668.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-2436x1125.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-2688x1242.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-2732x2048.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-640x1136.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-750x1334.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-828x1792.png +0 -0
- dstack/dashboard/statics/assets/browserconfig.xml +0 -15
- dstack/dashboard/statics/assets/coast-228x228.png +0 -0
- dstack/dashboard/statics/assets/favicon-16x16.png +0 -0
- dstack/dashboard/statics/assets/favicon-32x32.png +0 -0
- dstack/dashboard/statics/assets/favicon-48x48.png +0 -0
- dstack/dashboard/statics/assets/favicon.ico +0 -0
- dstack/dashboard/statics/assets/firefox_app_128x128.png +0 -0
- dstack/dashboard/statics/assets/firefox_app_512x512.png +0 -0
- dstack/dashboard/statics/assets/firefox_app_60x60.png +0 -0
- dstack/dashboard/statics/assets/manifest.webapp +0 -14
- dstack/dashboard/statics/assets/mstile-144x144.png +0 -0
- dstack/dashboard/statics/assets/mstile-150x150.png +0 -0
- dstack/dashboard/statics/assets/mstile-310x150.png +0 -0
- dstack/dashboard/statics/assets/mstile-310x310.png +0 -0
- dstack/dashboard/statics/assets/mstile-70x70.png +0 -0
- dstack/dashboard/statics/assets/yandex-browser-50x50.png +0 -0
- dstack/dashboard/statics/d0f71e48806e25d72553.png +0 -0
- dstack/dashboard/statics/index.html +0 -7
- dstack/dashboard/statics/main-1d87e34eb0454da8ebb4.js +0 -3
- dstack/dashboard/statics/main-1d87e34eb0454da8ebb4.js.LICENSE.txt +0 -102
- dstack/dashboard/statics/main-1d87e34eb0454da8ebb4.js.map +0 -1
- dstack/dashboard/statics/main.css +0 -5058
- dstack/dashboard/statics/splash_thumbnail.png +0 -0
- dstack/dashboard/statics/static/media/check.3f68ffc787a15c0476793a6d18ecb71a.svg +0 -3
- dstack/dashboard/statics/static/media/chevron-down.bfd8f22c4a5db4d443e76bca3b02f334.svg +0 -3
- dstack/dashboard/statics/static/media/chevron-up.bade0c5d82d741cead615813264140c9.svg +0 -3
- dstack/dashboard/statics/static/media/clock.583b744f29b9d143718a55e7c35fe38e.svg +0 -3
- dstack/dashboard/statics/static/media/close.a8bb9e47361b03a3b5084dad676ba1da.svg +0 -3
- dstack/dashboard/statics/static/media/content-copy.73f5f2a175094757758e315243a4111e.svg +0 -3
- dstack/dashboard/statics/static/media/delete-outline.6a8abf4e4f9cb777781967efd56efe9b.svg +0 -3
- dstack/dashboard/statics/static/media/dots-vertical.82fc618192e0c7dc4d615ff93269246a.svg +0 -3
- dstack/dashboard/statics/static/media/earth.1ad57c7f59f4be5c8bb2fa00439c3149.svg +0 -3
- dstack/dashboard/statics/static/media/email.320bc3af24a5f1bb41ebd85f66a5dd70.svg +0 -3
- dstack/dashboard/statics/static/media/external-link.99b88e699c15afb820a1779d9a2261ed.svg +0 -3
- dstack/dashboard/statics/static/media/eye-off-outline.5b4afb7ad624a44dd307518ff93d1faa.svg +0 -3
- dstack/dashboard/statics/static/media/eye-outline.ca41708feaaed1edb15c5fff021fbafe.svg +0 -3
- dstack/dashboard/statics/static/media/file-download-outline.3634b41923ba79b297ff294ef898661c.svg +0 -3
- dstack/dashboard/statics/static/media/folder-outline.33378387af61821dd1207e4b2d061a07.svg +0 -3
- dstack/dashboard/statics/static/media/github-circle.1bb85d171c31a3c2eebad07319377171.svg +0 -3
- dstack/dashboard/statics/static/media/infinity.915f92939afc0a37f94adba211ceb172.svg +0 -3
- dstack/dashboard/statics/static/media/layers.b4b02cea267a617d7aa44c2719250c89.svg +0 -3
- dstack/dashboard/statics/static/media/linkedin.1c52fae553eee54397f0e63a79455a5e.svg +0 -3
- dstack/dashboard/statics/static/media/loading.e466be7b2c1f0ac9e7e51ca929d0e37d.svg +0 -3
- dstack/dashboard/statics/static/media/lock.4a4c7768d0fa60c716609ddc483470ef.svg +0 -3
- dstack/dashboard/statics/static/media/magnify.0c803314d039d21f3cb1504ccd1437a4.svg +0 -3
- dstack/dashboard/statics/static/media/mark.3f68ffc787a15c0476793a6d18ecb71a.svg +0 -3
- dstack/dashboard/statics/static/media/menu-close.3ee84714181017c6ff837830297c8437.svg +0 -3
- dstack/dashboard/statics/static/media/menu.922f81e0972fbcbb5adcd8def20c86a3.svg +0 -3
- dstack/dashboard/statics/static/media/pencil.f706a3b9dcbff4959a91bf72e1e6324f.svg +0 -3
- dstack/dashboard/statics/static/media/refresh.a80edb948e98b322cd73b67814a57a48.svg +0 -3
- dstack/dashboard/statics/static/media/shape-plus.63b093c7f4b44c3def774f30fcfbceca.svg +0 -3
- dstack/dashboard/statics/static/media/slack.ec2fca99c6b944950ac65404ddd26880.svg +0 -4
- dstack/dashboard/statics/static/media/small-logo.b9cc8d09f646a553e65fa336dafd8b10.svg +0 -116
- dstack/dashboard/statics/static/media/source-branch.b8d22cfc42a7bed81f0fc08130818e85.svg +0 -3
- dstack/dashboard/statics/static/media/source-commit.be2bb53c081b9b6836adffccc0b8d3e6.svg +0 -3
- dstack/dashboard/statics/static/media/stop.11488ff1437ad929476be8924a3b7075.svg +0 -3
- dstack/dashboard/statics/static/media/tag-minus.15680a815b0b8d027e973c84832c05e6.svg +0 -3
- dstack/dashboard/statics/static/media/tag-outline.19b0bf86a8afd7d6d9c716e9a91d94ca.svg +0 -3
- dstack/dashboard/statics/static/media/twitter.4af18861c84a2f3044c7546b55d5739c.svg +0 -3
- dstack/dashboard/tags.py +0 -119
- dstack/jobs.py +0 -255
- dstack/providers/__init__.py +0 -316
- dstack/providers/_python/main.py +0 -88
- dstack/providers/_tensorboard/main.py +0 -93
- dstack/providers/_torchrun/main.py +0 -121
- dstack/providers/bash/main.py +0 -90
- dstack/providers/code/main.py +0 -95
- dstack/providers/docker/main.py +0 -79
- dstack/providers/lab/main.py +0 -95
- dstack/providers/notebook/main.py +0 -90
- dstack/random_name.py +0 -29
- dstack/repo.py +0 -135
- dstack/runners.py +0 -35
- dstack/util.py +0 -15
- dstack-0.0.9.dist-info/METADATA +0 -176
- dstack-0.0.9.dist-info/RECORD +0 -179
- dstack-0.0.9.dist-info/entry_points.txt +0 -3
- dstack-0.0.9.dist-info/top_level.txt +0 -2
- tests/test_config.py +0 -70
- /dstack/{cli → _internal}/__init__.py +0 -0
- /dstack/{dashboard → _internal/cli}/__init__.py +0 -0
- /dstack/{providers/_python → _internal/cli/models}/__init__.py +0 -0
- /dstack/{providers/_tensorboard → _internal/cli/services}/__init__.py +0 -0
- /dstack/{providers/_torchrun → _internal/cli/utils}/__init__.py +0 -0
- /dstack/{providers/bash → _internal/core}/__init__.py +0 -0
- /dstack/{providers/code → _internal/core/backends}/__init__.py +0 -0
- /dstack/{providers/docker → _internal/core/backends/aws}/__init__.py +0 -0
- /dstack/{providers/lab → _internal/core/backends/azure}/__init__.py +0 -0
- /dstack/{providers/notebook → _internal/core/backends/base}/__init__.py +0 -0
- {tests → dstack/_internal/core/backends/cloudrift}/__init__.py +0 -0
- /dstack/{dashboard → _internal/server}/statics/assets/yandex-browser-manifest.json +0 -0
- /dstack/{dashboard → _internal/server}/statics/robots.txt +0 -0
|
@@ -0,0 +1,1186 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import datetime
|
|
3
|
+
import logging
|
|
4
|
+
from datetime import timedelta
|
|
5
|
+
from typing import Any, Dict, Optional, cast
|
|
6
|
+
|
|
7
|
+
import gpuhunt
|
|
8
|
+
import requests
|
|
9
|
+
from paramiko.pkey import PKey
|
|
10
|
+
from paramiko.ssh_exception import PasswordRequiredException
|
|
11
|
+
from pydantic import ValidationError
|
|
12
|
+
from sqlalchemy import and_, delete, func, not_, select
|
|
13
|
+
from sqlalchemy.ext.asyncio import AsyncSession
|
|
14
|
+
from sqlalchemy.orm import joinedload
|
|
15
|
+
|
|
16
|
+
from dstack._internal import settings
|
|
17
|
+
from dstack._internal.core.backends.base.compute import (
|
|
18
|
+
ComputeWithCreateInstanceSupport,
|
|
19
|
+
ComputeWithPlacementGroupSupport,
|
|
20
|
+
GoArchType,
|
|
21
|
+
get_dstack_runner_binary_path,
|
|
22
|
+
get_dstack_runner_download_url,
|
|
23
|
+
get_dstack_runner_version,
|
|
24
|
+
get_dstack_shim_binary_path,
|
|
25
|
+
get_dstack_shim_download_url,
|
|
26
|
+
get_dstack_shim_version,
|
|
27
|
+
get_dstack_working_dir,
|
|
28
|
+
get_shim_env,
|
|
29
|
+
get_shim_pre_start_commands,
|
|
30
|
+
)
|
|
31
|
+
from dstack._internal.core.backends.features import (
|
|
32
|
+
BACKENDS_WITH_CREATE_INSTANCE_SUPPORT,
|
|
33
|
+
BACKENDS_WITH_PLACEMENT_GROUPS_SUPPORT,
|
|
34
|
+
)
|
|
35
|
+
from dstack._internal.core.consts import DSTACK_SHIM_HTTP_PORT
|
|
36
|
+
|
|
37
|
+
# FIXME: ProvisioningError is a subclass of ComputeError and should not be used outside of Compute
|
|
38
|
+
from dstack._internal.core.errors import (
|
|
39
|
+
BackendError,
|
|
40
|
+
NotYetTerminated,
|
|
41
|
+
ProvisioningError,
|
|
42
|
+
)
|
|
43
|
+
from dstack._internal.core.models.backends.base import BackendType
|
|
44
|
+
from dstack._internal.core.models.fleets import InstanceGroupPlacement
|
|
45
|
+
from dstack._internal.core.models.instances import (
|
|
46
|
+
HealthStatus,
|
|
47
|
+
InstanceAvailability,
|
|
48
|
+
InstanceOfferWithAvailability,
|
|
49
|
+
InstanceRuntime,
|
|
50
|
+
InstanceStatus,
|
|
51
|
+
InstanceTerminationReason,
|
|
52
|
+
RemoteConnectionInfo,
|
|
53
|
+
SSHKey,
|
|
54
|
+
)
|
|
55
|
+
from dstack._internal.core.models.profiles import (
|
|
56
|
+
TerminationPolicy,
|
|
57
|
+
)
|
|
58
|
+
from dstack._internal.core.models.runs import (
|
|
59
|
+
JobProvisioningData,
|
|
60
|
+
)
|
|
61
|
+
from dstack._internal.server import settings as server_settings
|
|
62
|
+
from dstack._internal.server.background.tasks.common import get_provisioning_timeout
|
|
63
|
+
from dstack._internal.server.db import get_db, get_session_ctx
|
|
64
|
+
from dstack._internal.server.models import (
|
|
65
|
+
FleetModel,
|
|
66
|
+
InstanceHealthCheckModel,
|
|
67
|
+
InstanceModel,
|
|
68
|
+
JobModel,
|
|
69
|
+
ProjectModel,
|
|
70
|
+
)
|
|
71
|
+
from dstack._internal.server.schemas.instances import InstanceCheck
|
|
72
|
+
from dstack._internal.server.schemas.runner import (
|
|
73
|
+
ComponentInfo,
|
|
74
|
+
ComponentStatus,
|
|
75
|
+
HealthcheckResponse,
|
|
76
|
+
InstanceHealthResponse,
|
|
77
|
+
)
|
|
78
|
+
from dstack._internal.server.services import backends as backends_services
|
|
79
|
+
from dstack._internal.server.services import events
|
|
80
|
+
from dstack._internal.server.services.fleets import (
|
|
81
|
+
fleet_model_to_fleet,
|
|
82
|
+
get_create_instance_offers,
|
|
83
|
+
is_cloud_cluster,
|
|
84
|
+
)
|
|
85
|
+
from dstack._internal.server.services.instances import (
|
|
86
|
+
get_instance_configuration,
|
|
87
|
+
get_instance_profile,
|
|
88
|
+
get_instance_provisioning_data,
|
|
89
|
+
get_instance_requirements,
|
|
90
|
+
get_instance_ssh_private_keys,
|
|
91
|
+
remove_dangling_tasks_from_instance,
|
|
92
|
+
switch_instance_status,
|
|
93
|
+
)
|
|
94
|
+
from dstack._internal.server.services.locking import get_locker
|
|
95
|
+
from dstack._internal.server.services.logging import fmt
|
|
96
|
+
from dstack._internal.server.services.offers import (
|
|
97
|
+
get_instance_offer_with_restricted_az,
|
|
98
|
+
is_divisible_into_blocks,
|
|
99
|
+
)
|
|
100
|
+
from dstack._internal.server.services.placement import (
|
|
101
|
+
find_or_create_suitable_placement_group,
|
|
102
|
+
get_fleet_placement_group_models,
|
|
103
|
+
get_placement_group_model_for_instance,
|
|
104
|
+
placement_group_model_to_placement_group_optional,
|
|
105
|
+
schedule_fleet_placement_groups_deletion,
|
|
106
|
+
)
|
|
107
|
+
from dstack._internal.server.services.runner import client as runner_client
|
|
108
|
+
from dstack._internal.server.services.runner.ssh import runner_ssh_tunnel
|
|
109
|
+
from dstack._internal.server.utils import sentry_utils
|
|
110
|
+
from dstack._internal.server.utils.provisioning import (
|
|
111
|
+
detect_cpu_arch,
|
|
112
|
+
get_host_info,
|
|
113
|
+
get_paramiko_connection,
|
|
114
|
+
get_shim_healthcheck,
|
|
115
|
+
host_info_to_instance_type,
|
|
116
|
+
remove_dstack_runner_if_exists,
|
|
117
|
+
remove_host_info_if_exists,
|
|
118
|
+
run_pre_start_commands,
|
|
119
|
+
run_shim_as_systemd_service,
|
|
120
|
+
upload_envs,
|
|
121
|
+
)
|
|
122
|
+
from dstack._internal.utils.common import (
|
|
123
|
+
get_current_datetime,
|
|
124
|
+
get_or_error,
|
|
125
|
+
run_async,
|
|
126
|
+
)
|
|
127
|
+
from dstack._internal.utils.logging import get_logger
|
|
128
|
+
from dstack._internal.utils.network import get_ip_from_network, is_ip_among_addresses
|
|
129
|
+
from dstack._internal.utils.ssh import (
|
|
130
|
+
pkey_from_str,
|
|
131
|
+
)
|
|
132
|
+
|
|
133
|
+
# Instances whose `last_processed_at` is more recent than this interval
# are not selected for processing by `_process_next_instance`.
MIN_PROCESSING_INTERVAL = timedelta(seconds=10)

PENDING_JOB_RETRY_INTERVAL = timedelta(seconds=60)

TERMINATION_DEADLINE_OFFSET = timedelta(minutes=20)
TERMINATION_RETRY_TIMEOUT = timedelta(seconds=30)
TERMINATION_RETRY_MAX_DURATION = timedelta(minutes=15)
PROVISIONING_TIMEOUT_SECONDS = 10 * 60  # 10 minutes in seconds


# Module-level logger named after this module.
logger = get_logger(__name__)
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
async def process_instances(batch_size: int = 1):
    """Run `batch_size` instance-processing passes concurrently.

    One `_process_next_instance()` coroutine is created per pass and all
    of them are awaited together with `asyncio.gather`.

    Args:
        batch_size: Number of `_process_next_instance()` coroutines to run.
    """
    passes = [_process_next_instance() for _ in range(batch_size)]
    await asyncio.gather(*passes)
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
@sentry_utils.instrument_background_task
async def delete_instance_health_checks():
    """Delete instance health-check rows older than the configured TTL.

    Rows of `InstanceHealthCheckModel` whose `collected_at` is earlier than
    the current time minus `SERVER_INSTANCE_HEALTH_TTL_SECONDS` are deleted,
    and the deletion is committed in the same session.
    """
    cutoff = get_current_datetime() - timedelta(
        seconds=server_settings.SERVER_INSTANCE_HEALTH_TTL_SECONDS
    )
    async with get_session_ctx() as session:
        expired_checks = delete(InstanceHealthCheckModel).where(
            InstanceHealthCheckModel.collected_at < cutoff
        )
        await session.execute(expired_checks)
        await session.commit()
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
@sentry_utils.instrument_background_task
async def _process_next_instance():
    """Select one eligible instance and hand it to `_process_instance`.

    While holding the lock obtained from the locker, a single row is
    selected that satisfies all of the following:

    * its status is PENDING, PROVISIONING, BUSY, IDLE or TERMINATING;
    * it is not a TERMINATING instance with a compute group assigned;
    * its id is not already in the in-memory lockset;
    * its `last_processed_at` is older than now minus
      `MIN_PROCESSING_INTERVAL`.

    Candidates are ordered by `last_processed_at` ascending and limited to
    one row, selected with `FOR UPDATE` (`skip_locked`, `key_share`).
    If nothing matches, the function returns without doing anything.
    Otherwise the instance id is added to the lockset, the instance is
    processed, and the id is removed from the lockset even if processing
    raises.
    """
    locker = get_locker(get_db().dialect_name)
    lock, lockset = locker.get_lockset(InstanceModel.__tablename__)
    async with get_session_ctx() as session:
        async with lock:
            processable_statuses = [
                InstanceStatus.PENDING,
                InstanceStatus.PROVISIONING,
                InstanceStatus.BUSY,
                InstanceStatus.IDLE,
                InstanceStatus.TERMINATING,
            ]
            # Terminating instances belonging to a compute group
            # are handled by process_compute_groups.
            terminating_in_compute_group = and_(
                InstanceModel.status == InstanceStatus.TERMINATING,
                InstanceModel.compute_group_id.is_not(None),
            )
            stmt = (
                select(InstanceModel)
                .where(
                    InstanceModel.status.in_(processable_statuses),
                    not_(terminating_in_compute_group),
                    InstanceModel.id.not_in(lockset),
                    InstanceModel.last_processed_at
                    < get_current_datetime() - MIN_PROCESSING_INTERVAL,
                )
                .options(joinedload(InstanceModel.jobs).load_only(JobModel.id, JobModel.status))
                .options(joinedload(InstanceModel.project).load_only(ProjectModel.ssh_private_key))
                .order_by(InstanceModel.last_processed_at.asc())
                .limit(1)
                .with_for_update(skip_locked=True, key_share=True, of=InstanceModel)
            )
            result = await session.execute(stmt)
            instance = result.scalar()
            if instance is None:
                return
            # Mark the instance as taken before the lock is released.
            lockset.add(instance.id)
        # Remember the id up front so the cleanup below does not depend on
        # the state of the ORM object after processing.
        instance_model_id = instance.id
        try:
            await _process_instance(session=session, instance=instance)
        finally:
            lockset.difference_update([instance_model_id])
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
async def _process_instance(session: AsyncSession, instance: InstanceModel):
    """Run the handler matching the instance status, then commit.

    For PENDING, TERMINATING and IDLE instances the row is first refetched
    with the project, jobs and fleet relationships eagerly loaded. After the
    handler returns, ``last_processed_at`` is updated and the session is
    committed.
    """
    logger.debug("%s: processing instance, status: %s", fmt(instance), instance.status.upper())
    # Refetch to load related attributes.
    # Load related attributes only for statuses that always need them.
    with_backends = instance.status in (
        InstanceStatus.PENDING,
        InstanceStatus.TERMINATING,
    )
    if with_backends or instance.status == InstanceStatus.IDLE:
        project_loader = joinedload(InstanceModel.project)
        if with_backends:
            # Project backends are loaded for PENDING and TERMINATING instances only.
            project_loader = project_loader.joinedload(ProjectModel.backends)
        refetch_stmt = (
            select(InstanceModel)
            .where(InstanceModel.id == instance.id)
            .options(project_loader)
            .options(joinedload(InstanceModel.jobs).load_only(JobModel.id, JobModel.status))
            .options(
                joinedload(InstanceModel.fleet).joinedload(
                    FleetModel.instances.and_(InstanceModel.deleted == False)
                ),
            )
            .execution_options(populate_existing=True)
        )
        res = await session.execute(refetch_stmt)
        instance = res.unique().scalar_one()

    # Read the status after the refetch so that the dispatch sees the refreshed value.
    current_status = instance.status
    if current_status == InstanceStatus.PENDING:
        if instance.remote_connection_info is None:
            await _create_instance(
                session=session,
                instance=instance,
            )
        else:
            await _add_remote(session, instance)
    elif current_status == InstanceStatus.TERMINATING:
        await _terminate(session, instance)
    elif current_status in (
        InstanceStatus.PROVISIONING,
        InstanceStatus.IDLE,
        InstanceStatus.BUSY,
    ):
        # Skip the reachability/health check if the instance was just marked as terminating.
        if not _check_and_mark_terminating_if_idle_duration_expired(session, instance):
            await _check_instance(session, instance)

    instance.last_processed_at = get_current_datetime()
    await session.commit()
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
def _check_and_mark_terminating_if_idle_duration_expired(
    session: AsyncSession, instance: InstanceModel
):
    """Switch an idle instance to TERMINATING once its idle time is exceeded.

    Only instances that are IDLE, use the ``DESTROY_AFTER_IDLE`` termination
    policy and have no jobs are considered. Fleet instances are skipped when
    `_can_terminate_fleet_instances_on_idle_duration` forbids termination.

    Args:
        session: The session passed through to `switch_instance_status`.
        instance: The instance to check; mutated in place on termination.

    Returns:
        True if the instance was switched to TERMINATING, False otherwise.
    """
    if not (
        instance.status == InstanceStatus.IDLE
        and instance.termination_policy == TerminationPolicy.DESTROY_AFTER_IDLE
        and not instance.jobs
    ):
        return False
    if instance.fleet is not None and not _can_terminate_fleet_instances_on_idle_duration(
        instance.fleet
    ):
        logger.debug(
            "Skipping instance %s termination on idle duration. Fleet is already at `nodes.min`.",
            instance.name,
        )
        return False
    idle_duration = _get_instance_idle_duration(instance)
    idle_seconds = instance.termination_idle_time
    delta = datetime.timedelta(seconds=idle_seconds)
    if idle_duration > delta:
        instance.termination_reason = InstanceTerminationReason.IDLE_TIMEOUT
        # `timedelta.seconds` is only the sub-day component (0..86399), so it would
        # under-report any idle time longer than a day. Use the full duration instead.
        idle_total_seconds = int(idle_duration.total_seconds())
        instance.termination_reason_message = f"Instance idle for {idle_total_seconds}s"
        switch_instance_status(session, instance, InstanceStatus.TERMINATING)
        return True
    return False
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
def _can_terminate_fleet_instances_on_idle_duration(fleet_model: FleetModel) -> bool:
    """Tell whether the fleet may lose an instance because of idle timeout.

    Returns True when the fleet has no ``nodes`` configuration, when it is
    autocreated, or when it has more active instances than ``nodes.min``.
    """
    # Do not terminate instances on idle duration if fleet is already at `nodes.min`.
    # This is an optimization to avoid terminate-create loop.
    # There may be race conditions since we don't take the fleet lock.
    # That's ok: in the worst case we go below `nodes.min`, but
    # the fleet consolidation logic will provision new nodes.
    fleet = fleet_model_to_fleet(fleet_model)
    nodes = fleet.spec.configuration.nodes
    if nodes is None or fleet.spec.autocreated:
        return True
    active_instances_num = sum(1 for i in fleet_model.instances if i.status.is_active())
    return active_instances_num > nodes.min
|
|
311
|
+
|
|
312
|
+
|
|
313
|
+
async def _add_remote(session: AsyncSession, instance: InstanceModel) -> None:
    """Add an SSH instance described by ``instance.remote_connection_info``.

    The instance is switched to TERMINATED when the provisioning timeout has
    passed, when the SSH keys cannot be loaded, when the internal IP cannot be
    resolved or validated, or when the host cannot be split into blocks.
    If the deployment itself fails with `ProvisioningError`, the instance is
    left untouched so that a later processing pass can retry. On success the
    instance is switched to IDLE or PROVISIONING and its offer and job
    provisioning data are filled in.
    """
    logger.info("Adding ssh instance %s...", instance.name)

    # Give up if the instance has been pending for longer than the provisioning timeout.
    retry_duration_deadline = instance.created_at + timedelta(seconds=PROVISIONING_TIMEOUT_SECONDS)
    if retry_duration_deadline < get_current_datetime():
        instance.termination_reason = InstanceTerminationReason.PROVISIONING_TIMEOUT
        instance.termination_reason_message = (
            f"Failed to add SSH instance in {PROVISIONING_TIMEOUT_SECONDS}s"
        )
        switch_instance_status(session, instance, InstanceStatus.TERMINATED)
        return

    try:
        remote_details = RemoteConnectionInfo.parse_raw(cast(str, instance.remote_connection_info))
        # Prepare connection key
        try:
            pkeys = _ssh_keys_to_pkeys(remote_details.ssh_keys)
            if remote_details.ssh_proxy_keys is not None:
                ssh_proxy_pkeys = _ssh_keys_to_pkeys(remote_details.ssh_proxy_keys)
            else:
                ssh_proxy_pkeys = None
        except (ValueError, PasswordRequiredException):
            # Key loading errors are treated as permanent: no retry.
            instance.termination_reason = InstanceTerminationReason.ERROR
            instance.termination_reason_message = "Unsupported private SSH key type"
            switch_instance_status(session, instance, InstanceStatus.TERMINATED)
            return

        # Both the user-provided keys and the project key are passed to the deployment.
        authorized_keys = [pk.public.strip() for pk in remote_details.ssh_keys]
        authorized_keys.append(instance.project.ssh_public_key.strip())

        try:
            future = run_async(
                _deploy_instance, remote_details, pkeys, ssh_proxy_pkeys, authorized_keys
            )
            deploy_timeout = 20 * 60  # 20 minutes
            result = await asyncio.wait_for(future, timeout=deploy_timeout)
            health, host_info, arch = result
        except (asyncio.TimeoutError, TimeoutError) as e:
            raise ProvisioningError(f"Deploy timeout: {e}") from e
        except Exception as e:
            # Any deployment failure is normalized to ProvisioningError and handled below.
            raise ProvisioningError(f"Deploy instance raised an error: {e}") from e
    except ProvisioningError as e:
        logger.warning(
            "Provisioning instance %s could not be completed because of the error: %s",
            instance.name,
            e,
        )
        # Stays in PENDING, may retry later
        return

    instance_type = host_info_to_instance_type(host_info, arch)
    instance_network = None
    internal_ip = None
    # Reuse the network settings from previously stored provisioning data, if it parses.
    try:
        default_jpd = JobProvisioningData.__response__.parse_raw(instance.job_provisioning_data)
        instance_network = default_jpd.instance_network
        internal_ip = default_jpd.internal_ip
    except ValidationError:
        pass

    host_network_addresses = host_info.get("addresses", [])
    if internal_ip is None:
        internal_ip = get_ip_from_network(
            network=instance_network,
            addresses=host_network_addresses,
        )
    if instance_network is not None and internal_ip is None:
        instance.termination_reason = InstanceTerminationReason.ERROR
        instance.termination_reason_message = (
            "Failed to locate internal IP address on the given network"
        )
        switch_instance_status(session, instance, InstanceStatus.TERMINATED)
        return
    if internal_ip is not None:
        if not is_ip_among_addresses(ip_address=internal_ip, addresses=host_network_addresses):
            instance.termination_reason = InstanceTerminationReason.ERROR
            instance.termination_reason_message = (
                "Specified internal IP not found among instance interfaces"
            )
            switch_instance_status(session, instance, InstanceStatus.TERMINATED)
            return

    # "auto" is passed when the number of blocks was not set on the instance.
    divisible, blocks = is_divisible_into_blocks(
        cpu_count=instance_type.resources.cpus,
        gpu_count=len(instance_type.resources.gpus),
        blocks="auto" if instance.total_blocks is None else instance.total_blocks,
    )
    if divisible:
        instance.total_blocks = blocks
    else:
        instance.termination_reason = InstanceTerminationReason.ERROR
        instance.termination_reason_message = "Cannot split into blocks"
        switch_instance_status(session, instance, InstanceStatus.TERMINATED)
        return

    region = instance.region
    assert region is not None  # always set for ssh instances
    jpd = JobProvisioningData(
        backend=BackendType.REMOTE,
        instance_type=instance_type,
        instance_id="instance_id",
        hostname=remote_details.host,
        region=region,
        price=0,
        internal_ip=internal_ip,
        instance_network=instance_network,
        username=remote_details.ssh_user,
        ssh_port=remote_details.port,
        dockerized=True,
        backend_data=None,
        ssh_proxy=remote_details.ssh_proxy,
    )

    # NOTE(review): the status depends on the truthiness of the InstanceCheck returned
    # by _deploy_instance — confirm that InstanceCheck defines the intended `__bool__`.
    switch_instance_status(
        session, instance, InstanceStatus.IDLE if health else InstanceStatus.PROVISIONING
    )
    instance.backend = BackendType.REMOTE
    instance_offer = InstanceOfferWithAvailability(
        backend=BackendType.REMOTE,
        instance=instance_type,
        region=region,
        price=0,
        availability=InstanceAvailability.AVAILABLE,
        instance_runtime=InstanceRuntime.SHIM,
    )
    instance.price = 0
    instance.offer = instance_offer.json()
    instance.job_provisioning_data = jpd.json()
    instance.started_at = get_current_datetime()
|
|
442
|
+
|
|
443
|
+
|
|
444
|
+
def _deploy_instance(
    remote_details: RemoteConnectionInfo,
    pkeys: list[PKey],
    ssh_proxy_pkeys: Optional[list[PKey]],
    authorized_keys: list[str],
) -> tuple[InstanceCheck, dict[str, Any], GoArchType]:
    """Deploy dstack-shim to a remote host over SSH and collect host data.

    This is a blocking function: the caller in this file runs it via `run_async`.

    Args:
        remote_details: Connection parameters of the host (user, host, port,
            optional proxy, env).
        pkeys: Private keys used to connect to the host.
        ssh_proxy_pkeys: Private keys used for the SSH proxy, if any.
        authorized_keys: Public keys passed to the pre-start commands.

    Returns:
        A tuple of the instance check built from the shim healthcheck,
        the host info, and the detected CPU architecture.

    Raises:
        ProvisioningError: If the fleet env is invalid or the shim
            healthcheck response cannot be parsed.
    """
    with get_paramiko_connection(
        remote_details.ssh_user,
        remote_details.host,
        remote_details.port,
        pkeys,
        remote_details.ssh_proxy,
        ssh_proxy_pkeys,
    ) as client:
        logger.info(f"Connected to {remote_details.ssh_user} {remote_details.host}")

        arch = detect_cpu_arch(client)
        logger.info("%s: CPU arch is %s", remote_details.host, arch)

        # Execute pre start commands
        shim_pre_start_commands = get_shim_pre_start_commands(arch=arch)
        run_pre_start_commands(client, shim_pre_start_commands, authorized_keys)
        logger.debug("The script for installing dstack has been executed")

        # Upload envs
        shim_envs = get_shim_env(arch=arch)
        try:
            fleet_configuration_envs = remote_details.env.as_dict()
        except ValueError as e:
            raise ProvisioningError(f"Invalid Env: {e}") from e
        # Fleet configuration envs override the default shim envs on key collisions.
        shim_envs.update(fleet_configuration_envs)
        dstack_working_dir = get_dstack_working_dir()
        dstack_shim_binary_path = get_dstack_shim_binary_path()
        dstack_runner_binary_path = get_dstack_runner_binary_path()
        upload_envs(client, dstack_working_dir, shim_envs)
        logger.debug("The dstack-shim environment variables have been installed")

        # Ensure we have fresh versions of host info.json and dstack-runner
        remove_host_info_if_exists(client, dstack_working_dir)
        remove_dstack_runner_if_exists(client, dstack_runner_binary_path)

        # Run dstack-shim as a systemd service
        run_shim_as_systemd_service(
            client=client,
            binary_path=dstack_shim_binary_path,
            working_dir=dstack_working_dir,
            # Dev mode is enabled when no dstack version is configured.
            dev=settings.DSTACK_VERSION is None,
        )

        # Get host info
        host_info = get_host_info(client, dstack_working_dir)
        logger.debug("Received a host_info %s", host_info)

        healthcheck_out = get_shim_healthcheck(client)
        try:
            healthcheck = HealthcheckResponse.__response__.parse_raw(healthcheck_out)
        except ValueError as e:
            raise ProvisioningError(f"Cannot parse HealthcheckResponse: {e}") from e
        instance_check = runner_client.healthcheck_response_to_instance_check(healthcheck)

        return instance_check, host_info, arch
|
|
505
|
+
|
|
506
|
+
|
|
507
|
+
async def _create_instance(session: AsyncSession, instance: InstanceModel) -> None:
    """Try to provision a PENDING instance using the available backend offers.

    Offers are tried in order, at most ``server_settings.MAX_OFFERS_TRIED`` of
    them. On the first successful `create_instance` call the instance is
    switched to PROVISIONING and filled with the offer and provisioning data.
    If no offer succeeds, the instance is switched to TERMINATED with the
    ``NO_OFFERS`` reason; when it is the master of a cloud cluster fleet,
    the sibling instances of the fleet are terminated as well.
    """
    master_instance = await _get_fleet_master_instance(session, instance)
    if _need_to_wait_fleet_provisioning(instance, master_instance):
        logger.debug(
            "%s: waiting for the first instance in the fleet to be provisioned", fmt(instance)
        )
        return

    try:
        instance_configuration = get_instance_configuration(instance)
        profile = get_instance_profile(instance)
        requirements = get_instance_requirements(instance)
    except ValidationError as e:
        # Stored configuration cannot be parsed: terminate instead of retrying.
        instance.termination_reason = InstanceTerminationReason.ERROR
        instance.termination_reason_message = (
            f"Error to parse profile, requirements or instance_configuration: {e}"
        )
        switch_instance_status(session, instance, InstanceStatus.TERMINATED)
        logger.exception(
            "%s: error parsing profile, requirements or instance configuration", fmt(instance)
        )
        return

    # The placement group is determined when provisioning the master instance
    # and used for all other instances in the fleet.
    placement_group_models = await get_fleet_placement_group_models(
        session=session,
        fleet_id=instance.fleet_id,
    )
    placement_group_model = get_placement_group_model_for_instance(
        placement_group_models=placement_group_models,
        instance_model=instance,
        master_instance_model=master_instance,
    )
    offers = await get_create_instance_offers(
        project=instance.project,
        profile=profile,
        requirements=requirements,
        fleet_model=instance.fleet,
        placement_group=placement_group_model_to_placement_group_optional(placement_group_model),
        blocks="auto" if instance.total_blocks is None else instance.total_blocks,
        exclude_not_available=True,
    )

    # Limit number of offers tried to prevent long-running processing
    # in case all offers fail.
    for backend, instance_offer in offers[: server_settings.MAX_OFFERS_TRIED]:
        if instance_offer.backend not in BACKENDS_WITH_CREATE_INSTANCE_SUPPORT:
            continue
        compute = backend.compute()
        assert isinstance(compute, ComputeWithCreateInstanceSupport)
        instance_offer = _get_instance_offer_for_instance(
            instance_offer=instance_offer,
            instance=instance,
            master_instance=master_instance,
        )
        # Only the master instance of a cloud cluster fleet finds or creates a placement group.
        if (
            instance.fleet
            and is_cloud_cluster(instance.fleet)
            and instance.id == master_instance.id
            and instance_offer.backend in BACKENDS_WITH_PLACEMENT_GROUPS_SUPPORT
            and isinstance(compute, ComputeWithPlacementGroupSupport)
            and (
                compute.are_placement_groups_compatible_with_reservations(instance_offer.backend)
                or instance_configuration.reservation is None
            )
        ):
            placement_group_model = await find_or_create_suitable_placement_group(
                fleet_model=instance.fleet,
                placement_groups=placement_group_models,
                instance_offer=instance_offer,
                compute=compute,
            )
            if placement_group_model is None:  # error occurred
                continue
            session.add(placement_group_model)
            placement_group_models.append(placement_group_model)
        logger.debug(
            "Trying %s in %s/%s for $%0.4f per hour",
            instance_offer.instance.name,
            instance_offer.backend.value,
            instance_offer.region,
            instance_offer.price,
        )
        try:
            job_provisioning_data = await run_async(
                compute.create_instance,
                instance_offer,
                instance_configuration,
                placement_group_model_to_placement_group_optional(placement_group_model),
            )
        except BackendError as e:
            # Expected backend failure: log it and move on to the next offer.
            logger.warning(
                "%s launch in %s/%s failed: %s",
                instance_offer.instance.name,
                instance_offer.backend.value,
                instance_offer.region,
                repr(e),
                extra={"instance_name": instance.name},
            )
            continue
        except Exception:
            # Unexpected failure: log with traceback and still try the next offer.
            logger.exception(
                "Got exception when launching %s in %s/%s",
                instance_offer.instance.name,
                instance_offer.backend.value,
                instance_offer.region,
            )
            continue

        switch_instance_status(session, instance, InstanceStatus.PROVISIONING)
        instance.backend = backend.TYPE
        instance.region = instance_offer.region
        instance.price = instance_offer.price
        instance.instance_configuration = instance_configuration.json()
        instance.job_provisioning_data = job_provisioning_data.json()
        instance.offer = instance_offer.json()
        instance.total_blocks = instance_offer.total_blocks
        instance.started_at = get_current_datetime()

        if instance.fleet_id and instance.id == master_instance.id:
            # Clean up placement groups that did not end up being used.
            # Flush to update still uncommitted placement groups.
            await session.flush()
            await schedule_fleet_placement_groups_deletion(
                session=session,
                fleet_id=instance.fleet_id,
                except_placement_group_ids=(
                    [placement_group_model.id] if placement_group_model is not None else []
                ),
            )
        return

    # Reached only when no offer resulted in a created instance.
    instance.termination_reason = InstanceTerminationReason.NO_OFFERS
    instance.termination_reason_message = "All offers failed" if offers else "No offers found"
    switch_instance_status(session, instance, InstanceStatus.TERMINATED)
    if instance.fleet and instance.id == master_instance.id and is_cloud_cluster(instance.fleet):
        # Do not attempt to deploy other instances, as they won't determine the correct cluster
        # backend, region, and placement group without a successfully deployed master instance
        for sibling_instance in instance.fleet.instances:
            if sibling_instance.id == instance.id:
                continue
            sibling_instance.termination_reason = InstanceTerminationReason.MASTER_FAILED
            switch_instance_status(session, sibling_instance, InstanceStatus.TERMINATED)
|
|
651
|
+
|
|
652
|
+
|
|
653
|
+
async def _get_fleet_master_instance(
    session: AsyncSession, instance: InstanceModel
) -> InstanceModel:
    """Return the first instance of the given instance's fleet.

    Instances are ordered by ``instance_num`` and then by ``created_at``.
    Exactly one row is expected; `scalar_one` raises otherwise.
    """
    # The "master" fleet instance is relevant for cloud clusters only:
    # it can be any fixed instance that is chosen to be provisioned first.
    master_stmt = (
        select(InstanceModel)
        .where(InstanceModel.fleet_id == instance.fleet_id)
        .order_by(InstanceModel.instance_num, InstanceModel.created_at)
        .limit(1)
    )
    master_result = await session.execute(master_stmt)
    return master_result.scalar_one()
|
|
665
|
+
|
|
666
|
+
|
|
667
|
+
async def _check_instance(session: AsyncSession, instance: InstanceModel) -> None:
    """Check a PROVISIONING, IDLE or BUSY instance and update its state.

    The steps, in order:
    1. A BUSY instance whose jobs are all finished is switched to TERMINATING.
    2. If the provisioning data has no hostname yet, the backend is asked
       to update it and the function returns.
    3. Non-dockerized instances are switched from PROVISIONING to BUSY
       without any shim check.
    4. Otherwise the shim is checked through `_check_instance_inner`;
       health and reachability are stored, and unreachable instances are
       switched to TERMINATING once the relevant deadline has passed.
    """
    if (
        instance.status == InstanceStatus.BUSY
        and instance.jobs
        and all(job.status.is_finished() for job in instance.jobs)
    ):
        # A busy instance could have no active jobs due to this bug: https://github.com/dstackai/dstack/issues/2068
        instance.termination_reason = InstanceTerminationReason.JOB_FINISHED
        switch_instance_status(session, instance, InstanceStatus.TERMINATING)
        logger.warning(
            "Detected busy instance %s with finished job. Marked as TERMINATING",
            instance.name,
            extra={
                "instance_name": instance.name,
                "instance_status": instance.status.value,
            },
        )
        return

    job_provisioning_data = get_or_error(get_instance_provisioning_data(instance))
    if job_provisioning_data.hostname is None:
        # The project is loaded with its backends for the provisioning data update below.
        res = await session.execute(
            select(ProjectModel)
            .where(ProjectModel.id == instance.project_id)
            .options(joinedload(ProjectModel.backends))
        )
        project = res.unique().scalar_one()
        await _wait_for_instance_provisioning_data(
            session=session,
            project=project,
            instance=instance,
            job_provisioning_data=job_provisioning_data,
        )
        return

    if not job_provisioning_data.dockerized:
        if instance.status == InstanceStatus.PROVISIONING:
            switch_instance_status(session, instance, InstanceStatus.BUSY)
        return

    ssh_private_keys = get_instance_ssh_private_keys(instance)

    # Collect health only if no health check was stored within the minimal collect interval.
    health_check_cutoff = get_current_datetime() - timedelta(
        seconds=server_settings.SERVER_INSTANCE_HEALTH_MIN_COLLECT_INTERVAL_SECONDS
    )
    res = await session.execute(
        select(func.count(1)).where(
            InstanceHealthCheckModel.instance_id == instance.id,
            InstanceHealthCheckModel.collected_at > health_check_cutoff,
        )
    )
    check_instance_health = res.scalar_one() == 0

    # May return False if fails to establish ssh connection
    # NOTE(review): the positional arguments are presumably consumed by the
    # runner_ssh_tunnel wrapper (the third one is passed as None) — confirm against it.
    instance_check = await run_async(
        _check_instance_inner,
        ssh_private_keys,
        job_provisioning_data,
        None,
        instance=instance,
        check_instance_health=check_instance_health,
    )
    if instance_check is False:
        instance_check = InstanceCheck(reachable=False, message="SSH or tunnel error")

    if instance_check.reachable and check_instance_health:
        health_status = instance_check.get_health_status()
    else:
        # Keep previous health status
        health_status = instance.health

    # Log at WARNING only for unreachable available instances or fresh unhealthy results.
    loglevel = logging.DEBUG
    if not instance_check.reachable and instance.status.is_available():
        loglevel = logging.WARNING
    elif check_instance_health and not health_status.is_healthy():
        loglevel = logging.WARNING
    logger.log(
        loglevel,
        "Instance %s check: reachable=%s health_status=%s message=%r",
        instance.name,
        instance_check.reachable,
        health_status.name,
        instance_check.message,
        extra={"instance_name": instance.name, "health_status": health_status},
    )

    if instance_check.has_health_checks():
        # ensured by has_health_checks()
        assert instance_check.health_response is not None
        health_check_model = InstanceHealthCheckModel(
            instance_id=instance.id,
            collected_at=get_current_datetime(),
            status=health_status,
            response=instance_check.health_response.json(),
        )
        session.add(health_check_model)

    _set_health(session, instance, health_status)
    _set_unreachable(session, instance, unreachable=not instance_check.reachable)

    if instance_check.reachable:
        # A reachable instance resets the unreachability deadline.
        instance.termination_deadline = None

        if instance.status == InstanceStatus.PROVISIONING:
            switch_instance_status(
                session,
                instance,
                InstanceStatus.IDLE if not instance.jobs else InstanceStatus.BUSY,
            )
        return

    # From here on the instance is unreachable.
    if instance.termination_deadline is None:
        instance.termination_deadline = get_current_datetime() + TERMINATION_DEADLINE_OFFSET

    if instance.status == InstanceStatus.PROVISIONING and instance.started_at is not None:
        provisioning_deadline = _get_provisioning_deadline(
            instance=instance,
            job_provisioning_data=job_provisioning_data,
        )
        if get_current_datetime() > provisioning_deadline:
            instance.termination_reason = InstanceTerminationReason.PROVISIONING_TIMEOUT
            instance.termination_reason_message = "Instance did not become reachable in time"
            switch_instance_status(session, instance, InstanceStatus.TERMINATING)
    elif instance.status.is_available():
        deadline = instance.termination_deadline
        if get_current_datetime() > deadline:
            instance.termination_reason = InstanceTerminationReason.UNREACHABLE
            switch_instance_status(session, instance, InstanceStatus.TERMINATING)
|
|
795
|
+
|
|
796
|
+
|
|
797
|
+
async def _wait_for_instance_provisioning_data(
    session: AsyncSession,
    project: ProjectModel,
    instance: InstanceModel,
    job_provisioning_data: JobProvisioningData,
):
    """Ask the backend to update provisioning data of a not yet running instance.

    The instance is switched to TERMINATING if the provisioning deadline has
    passed, if the project has no backend of the required type, or if the
    backend raises `ProvisioningError`. Any other exception is only logged,
    so the update is attempted again on a later processing pass.
    """
    logger.debug(
        "Waiting for instance %s to become running",
        instance.name,
    )
    deadline = _get_provisioning_deadline(
        instance=instance,
        job_provisioning_data=job_provisioning_data,
    )
    if get_current_datetime() > deadline:
        instance.termination_reason = InstanceTerminationReason.PROVISIONING_TIMEOUT
        instance.termination_reason_message = "Backend did not complete provisioning in time"
        switch_instance_status(session, instance, InstanceStatus.TERMINATING)
        return

    project_backend = await backends_services.get_project_backend_by_type(
        project=project,
        backend_type=job_provisioning_data.backend,
    )
    if project_backend is None:
        logger.warning(
            "Instance %s failed because instance's backend is not available",
            instance.name,
        )
        instance.termination_reason = InstanceTerminationReason.ERROR
        instance.termination_reason_message = "Backend not available"
        switch_instance_status(session, instance, InstanceStatus.TERMINATING)
        return

    try:
        # The provisioning data object is serialized back to the instance right after the call.
        await run_async(
            project_backend.compute().update_provisioning_data,
            job_provisioning_data,
            project.ssh_public_key,
            project.ssh_private_key,
        )
        instance.job_provisioning_data = job_provisioning_data.json()
    except ProvisioningError as provisioning_error:
        logger.warning(
            "Error while waiting for instance %s to become running: %s",
            instance.name,
            repr(provisioning_error),
        )
        instance.termination_reason = InstanceTerminationReason.ERROR
        instance.termination_reason_message = "Error while waiting for instance to become running"
        switch_instance_status(session, instance, InstanceStatus.TERMINATING)
    except Exception:
        # Best effort: unexpected errors do not terminate the instance.
        logger.exception(
            "Got exception when updating instance %s provisioning data", instance.name
        )
|
|
851
|
+
|
|
852
|
+
|
|
853
|
+
@runner_ssh_tunnel(ports=[DSTACK_SHIM_HTTP_PORT], retries=1)
def _check_instance_inner(
    ports: Dict[int, int], *, instance: InstanceModel, check_instance_health: bool = False
) -> InstanceCheck:
    """Query the shim through the tunnel and build an `InstanceCheck`.

    Args:
        ports: Port mapping; the entry for ``DSTACK_SHIM_HTTP_PORT`` is used
            to reach the shim API.
        instance: The instance being checked.
        check_instance_health: Whether to also request the instance health.

    Returns:
        An unreachable `InstanceCheck` if a shim API call fails, otherwise
        the check built from the healthcheck (and health) responses.
    """
    shim_client = runner_client.ShimClient(port=ports[DSTACK_SHIM_HTTP_PORT])
    instance_health_response: Optional[InstanceHealthResponse] = None
    # Tracks the shim method being called so that error messages can name it.
    shim_call = shim_client.healthcheck
    try:
        healthcheck_response = shim_call(unmask_exceptions=True)
        if check_instance_health:
            shim_call = shim_client.get_instance_health
            instance_health_response = shim_call()
    except requests.RequestException as e:
        msg_template = "shim.%s(): request error: %s"
        msg_args = (shim_call.__func__.__name__, e)
        logger.debug(msg_template, *msg_args)
        return InstanceCheck(reachable=False, message=msg_template % msg_args)
    except Exception as e:
        msg_template = "shim.%s(): unexpected exception %s: %s"
        msg_args = (shim_call.__func__.__name__, e.__class__.__name__, e)
        logger.exception(msg_template, *msg_args)
        return InstanceCheck(reachable=False, message=msg_template % msg_args)

    # Failing to remove dangling tasks must not fail the whole check.
    try:
        remove_dangling_tasks_from_instance(shim_client, instance)
    except Exception as e:
        logger.exception("%s: error removing dangling tasks: %s", fmt(instance), e)

    # There should be no shim API calls after this function call since it can request shim restart.
    _maybe_install_components(instance, shim_client)

    instance_check = runner_client.healthcheck_response_to_instance_check(
        healthcheck_response, instance_health_response
    )
    return instance_check
|
|
887
|
+
|
|
888
|
+
|
|
889
|
+
def _maybe_install_components(
    instance: InstanceModel, shim_client: runner_client.ShimClient
) -> None:
    """
    Request runner/shim installation where needed and restart the shim if appropriate.

    The shim is asked to shut down (non-forced) only when a shim version is reported
    as installed, it differs from the running version, nothing was requested or is
    being installed during this call, and the shim reports that a restart is safe.

    Args:
        instance: the instance the shim belongs to; used for log messages and
            passed on to the install helpers.
        shim_client: client used for all shim API calls.
    """
    try:
        components = shim_client.get_components()
    except requests.RequestException as exc:
        logger.warning("Instance %s: shim.get_components(): request error: %s", instance.name, exc)
        return
    if components is None:
        logger.debug("Instance %s: no components info", instance.name)
        return

    runner_requested = False
    runner_info = components.runner
    if runner_info is None:
        logger.debug("Instance %s: no runner info", instance.name)
    else:
        runner_requested = _maybe_install_runner(instance, shim_client, runner_info)

    shim_requested = False
    installed_shim_version: Optional[str] = None
    shim_info = components.shim
    if shim_info is None:
        logger.debug("Instance %s: no shim info", instance.name)
    else:
        if shim_info.status == ComponentStatus.INSTALLED:
            installed_shim_version = shim_info.version
        shim_requested = _maybe_install_shim(instance, shim_client, shim_info)

    running_shim_version = shim_client.get_version_string()

    # The checks below are ordered so that the shim is only queried about
    # restart safety when every cheaper condition allows a restart.
    if installed_shim_version is None:
        # old shim without `dstack-shim` component and `/api/shutdown` support
        return
    if installed_shim_version == running_shim_version:
        # the same version is already running
        return
    if runner_requested or shim_requested:
        # we just requested installation of at least one component
        return
    if any(c.status == ComponentStatus.INSTALLING for c in components):
        # at least one component is already being installed
        return
    if not shim_client.is_safe_to_restart():
        # at least one shim task won't survive restart
        return

    if not shim_client.shutdown(force=False):
        logger.debug("Instance %s: cannot restart shim", instance.name)
        return
    logger.debug(
        "Instance %s: restarting shim %s -> %s",
        instance.name,
        running_shim_version,
        installed_shim_version,
    )
|
|
940
|
+
|
|
941
|
+
|
|
942
|
+
def _maybe_install_runner(
    instance: InstanceModel, shim_client: runner_client.ShimClient, runner_info: ComponentInfo
) -> bool:
    """
    Ask the shim to install the expected runner version if it is not installed yet.

    Args:
        instance: the instance the shim belongs to (used for logging and CPU arch lookup).
        shim_client: client used to send the installation request.
        runner_info: runner component status and version as reported by the shim.

    Returns:
        True if an installation request was sent successfully, False otherwise
        (unknown expected version, installation in progress, version already
        installed, or the request failed).
    """
    # For developers:
    # * To install the latest dev build for the current branch from the CI,
    #   set DSTACK_USE_LATEST_FROM_BRANCH=1.
    # * To provide your own build, set DSTACK_RUNNER_VERSION_URL and DSTACK_RUNNER_DOWNLOAD_URL.
    target_version = get_dstack_runner_version()
    if target_version is None:
        logger.debug("Cannot determine the expected runner version")
        return False

    current_version = runner_info.version
    logger.debug(
        "Instance %s: runner status=%s installed_version=%s",
        instance.name,
        runner_info.status.value,
        current_version or "(no version)",
    )

    if runner_info.status == ComponentStatus.INSTALLING:
        logger.debug("Instance %s: runner is already being installed", instance.name)
        return False

    if current_version and current_version == target_version:
        logger.debug("Instance %s: expected runner version already installed", instance.name)
        return False

    cpu_arch = _get_instance_cpu_arch(instance)
    download_url = get_dstack_runner_download_url(arch=cpu_arch, version=target_version)
    logger.debug(
        "Instance %s: installing runner %s -> %s from %s",
        instance.name,
        current_version or "(no version)",
        target_version,
        download_url,
    )
    try:
        shim_client.install_runner(download_url)
    except requests.RequestException as exc:
        logger.warning("Instance %s: shim.install_runner(): %s", instance.name, exc)
        return False
    else:
        return True
|
|
986
|
+
|
|
987
|
+
|
|
988
|
+
def _maybe_install_shim(
    instance: InstanceModel, shim_client: runner_client.ShimClient, shim_info: ComponentInfo
) -> bool:
    """
    Ask the shim to install the expected shim version if it is not installed yet.

    Args:
        instance: the instance the shim belongs to (used for logging and CPU arch lookup).
        shim_client: client used to send the installation request.
        shim_info: shim component status and version as reported by the shim.

    Returns:
        True if an installation request was sent successfully, False otherwise
        (unknown expected version, installation in progress, version already
        installed, or the request failed).
    """
    # For developers:
    # * To install the latest dev build for the current branch from the CI,
    #   set DSTACK_USE_LATEST_FROM_BRANCH=1.
    # * To provide your own build, set DSTACK_SHIM_VERSION_URL and DSTACK_SHIM_DOWNLOAD_URL.
    target_version = get_dstack_shim_version()
    if target_version is None:
        logger.debug("Cannot determine the expected shim version")
        return False

    current_version = shim_info.version
    logger.debug(
        "Instance %s: shim status=%s installed_version=%s running_version=%s",
        instance.name,
        shim_info.status.value,
        current_version or "(no version)",
        shim_client.get_version_string(),
    )

    if shim_info.status == ComponentStatus.INSTALLING:
        logger.debug("Instance %s: shim is already being installed", instance.name)
        return False

    if current_version and current_version == target_version:
        logger.debug("Instance %s: expected shim version already installed", instance.name)
        return False

    cpu_arch = _get_instance_cpu_arch(instance)
    download_url = get_dstack_shim_download_url(arch=cpu_arch, version=target_version)
    logger.debug(
        "Instance %s: installing shim %s -> %s from %s",
        instance.name,
        current_version or "(no version)",
        target_version,
        download_url,
    )
    try:
        shim_client.install_shim(download_url)
    except requests.RequestException as exc:
        logger.warning("Instance %s: shim.install_shim(): %s", instance.name, exc)
        return False
    else:
        return True
|
|
1033
|
+
|
|
1034
|
+
|
|
1035
|
+
def _get_instance_cpu_arch(instance: InstanceModel) -> Optional[gpuhunt.CPUArchitecture]:
    """
    Return the CPU architecture recorded in the instance's provisioning data.

    Returns None when the instance has no provisioning data.
    """
    provisioning_data = get_instance_provisioning_data(instance)
    return (
        None
        if provisioning_data is None
        else provisioning_data.instance_type.resources.cpu_arch
    )
|
|
1040
|
+
|
|
1041
|
+
|
|
1042
|
+
async def _terminate(session: AsyncSession, instance: InstanceModel) -> None:
    """
    Terminate the instance in its backend (when applicable) and mark it terminated.

    Does nothing if a previous attempt failed and the next retry time has not
    come yet. If the backend call fails and the next retry would still be before
    the termination deadline, the retry timestamps are updated and the function
    returns without marking the instance. In every other case the instance is
    marked deleted and switched to `InstanceStatus.TERMINATED`; that includes an
    unavailable backend and retries that have run out.

    Args:
        session: DB session passed to `switch_instance_status`.
        instance: the instance to terminate; mutated in place.
    """
    # Back off: a previous attempt failed and it is too early to retry.
    if (
        instance.last_termination_retry_at is not None
        and _next_termination_retry_at(instance) > get_current_datetime()
    ):
        return
    jpd = get_instance_provisioning_data(instance)
    # Only instances with provisioning data on a non-REMOTE backend go through the backend call.
    if jpd is not None and jpd.backend != BackendType.REMOTE:
        backend = await backends_services.get_project_backend_by_type(
            project=instance.project, backend_type=jpd.backend
        )
        if backend is None:
            # Nothing can be called; the error is logged and the instance is
            # still marked terminated below.
            logger.error(
                "Failed to terminate instance %s. Backend %s not available.",
                instance.name,
                jpd.backend,
            )
        else:
            logger.debug("Terminating runner instance %s", jpd.hostname)
            try:
                await run_async(
                    backend.compute().terminate_instance,
                    jpd.instance_id,
                    jpd.region,
                    jpd.backend_data,
                )
            except Exception as e:
                # Record when retries started (once) and when the latest attempt happened.
                if instance.first_termination_retry_at is None:
                    instance.first_termination_retry_at = get_current_datetime()
                instance.last_termination_retry_at = get_current_datetime()
                # Still within the retry window: leave the instance as is and try again later.
                if _next_termination_retry_at(instance) < _get_termination_deadline(instance):
                    if isinstance(e, NotYetTerminated):
                        logger.debug("Instance %s termination in progress: %s", instance.name, e)
                    else:
                        logger.warning(
                            "Failed to terminate instance %s. Will retry. Error: %r",
                            instance.name,
                            e,
                            # Traceback is attached only for non-BackendError exceptions.
                            exc_info=not isinstance(e, BackendError),
                        )
                    return
                # Retry window exhausted: give up and fall through to mark the instance terminated.
                logger.error(
                    "Failed all attempts to terminate instance %s."
                    " Please terminate the instance manually to avoid unexpected charges."
                    " Error: %r",
                    instance.name,
                    e,
                    exc_info=not isinstance(e, BackendError),
                )

    instance.deleted = True
    instance.deleted_at = get_current_datetime()
    instance.finished_at = get_current_datetime()
    switch_instance_status(session, instance, InstanceStatus.TERMINATED)
|
|
1096
|
+
|
|
1097
|
+
|
|
1098
|
+
def _set_health(session: AsyncSession, instance: InstanceModel, health: HealthStatus) -> None:
    """
    Store the new health status on the instance, emitting an event when it changes.

    Args:
        session: DB session passed to `events.emit`.
        instance: the instance to update; also the event target.
        health: the new health status.
    """
    previous = instance.health
    if previous != health:
        description = f"Instance health changed {previous.upper()} -> {health.upper()}"
        events.emit(
            session,
            description,
            actor=events.SystemActor(),
            targets=[events.Target.from_model(instance)],
        )
    instance.health = health
|
|
1107
|
+
|
|
1108
|
+
|
|
1109
|
+
def _set_unreachable(session: AsyncSession, instance: InstanceModel, unreachable: bool) -> None:
    """
    Store the reachability flag on the instance, emitting an event on a change.

    The event is emitted only while the instance status reports itself available;
    the flag itself is always updated.

    Args:
        session: DB session passed to `events.emit`.
        instance: the instance to update; also the event target.
        unreachable: the new value of the flag.
    """
    # avoid misleading event during provisioning
    if instance.status.is_available() and instance.unreachable != unreachable:
        if unreachable:
            description = "Instance became unreachable"
        else:
            description = "Instance became reachable"
        events.emit(
            session,
            description,
            actor=events.SystemActor(),
            targets=[events.Target.from_model(instance)],
        )
    instance.unreachable = unreachable
|
|
1121
|
+
|
|
1122
|
+
|
|
1123
|
+
def _next_termination_retry_at(instance: InstanceModel) -> datetime.datetime:
    """
    Return the earliest time at which termination may be retried.

    That is the last retry time plus `TERMINATION_RETRY_TIMEOUT`. The instance
    must already have `last_termination_retry_at` set.
    """
    last_attempt = instance.last_termination_retry_at
    assert last_attempt is not None
    return last_attempt + TERMINATION_RETRY_TIMEOUT
|
|
1126
|
+
|
|
1127
|
+
|
|
1128
|
+
def _get_termination_deadline(instance: InstanceModel) -> datetime.datetime:
    """
    Return the time after which termination is no longer retried.

    That is the first retry time plus `TERMINATION_RETRY_MAX_DURATION`. The
    instance must already have `first_termination_retry_at` set.
    """
    first_attempt = instance.first_termination_retry_at
    assert first_attempt is not None
    return first_attempt + TERMINATION_RETRY_MAX_DURATION
|
|
1131
|
+
|
|
1132
|
+
|
|
1133
|
+
def _need_to_wait_fleet_provisioning(
    instance: InstanceModel, master_instance: InstanceModel
) -> bool:
    """
    Tell whether the instance must wait for the fleet's master instance.

    Returns False when the instance has no fleet, is the master itself, or the
    master already has provisioning data or is terminated. Otherwise returns
    whatever `is_cloud_cluster()` reports for the fleet.
    """
    # Cluster cloud instances should wait for the first fleet instance to be provisioned
    # so that they are provisioned in the same backend/region
    fleet_model = instance.fleet
    if fleet_model is None:
        return False
    if instance.id == master_instance.id:
        return False
    if master_instance.job_provisioning_data is not None:
        return False
    if master_instance.status == InstanceStatus.TERMINATED:
        return False
    return is_cloud_cluster(fleet_model)
|
|
1147
|
+
|
|
1148
|
+
|
|
1149
|
+
def _get_instance_offer_for_instance(
    instance_offer: InstanceOfferWithAvailability,
    instance: InstanceModel,
    master_instance: InstanceModel,
) -> InstanceOfferWithAvailability:
    """
    Adapt an offer to the instance's fleet placement.

    For an instance in a fleet with CLUSTER placement, the offer is passed through
    `get_instance_offer_with_restricted_az()` together with the master instance's
    provisioning data. In all other cases the offer is returned unchanged.
    """
    fleet_model = instance.fleet
    if fleet_model is None:
        return instance_offer
    placement = fleet_model_to_fleet(fleet_model).spec.configuration.placement
    if placement != InstanceGroupPlacement.CLUSTER:
        return instance_offer
    return get_instance_offer_with_restricted_az(
        instance_offer=instance_offer,
        master_job_provisioning_data=get_instance_provisioning_data(master_instance),
    )
|
|
1164
|
+
|
|
1165
|
+
|
|
1166
|
+
def _get_instance_idle_duration(instance: InstanceModel) -> datetime.timedelta:
    """
    Return how long the instance has been idle.

    Measured from the time the last job was processed, or from the creation
    time if no job has been processed yet.
    """
    idle_since = instance.created_at
    last_job_at = instance.last_job_processed_at
    if last_job_at is not None:
        idle_since = last_job_at
    now = get_current_datetime()
    return now - idle_since
|
|
1171
|
+
|
|
1172
|
+
|
|
1173
|
+
def _get_provisioning_deadline(
    instance: InstanceModel,
    job_provisioning_data: JobProvisioningData,
) -> datetime.datetime:
    """
    Return the time by which the instance is expected to finish provisioning.

    Computed as the instance start time plus the timeout that
    `get_provisioning_timeout()` gives for the base backend and instance type.
    The instance must already have `started_at` set.
    """
    started_at = instance.started_at
    assert started_at is not None
    return started_at + get_provisioning_timeout(
        backend_type=job_provisioning_data.get_base_backend(),
        instance_type_name=job_provisioning_data.instance_type.name,
    )
|
|
1183
|
+
|
|
1184
|
+
|
|
1185
|
+
def _ssh_keys_to_pkeys(ssh_keys: list[SSHKey]) -> list[PKey]:
    """Convert SSH keys to `PKey` objects, skipping keys that have no private part."""
    pkeys = []
    for ssh_key in ssh_keys:
        if ssh_key.private is None:
            continue
        pkeys.append(pkey_from_str(ssh_key.private))
    return pkeys
|