dstack 0.0.9__py3-none-any.whl → 0.20.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dstack/_internal/cli/commands/__init__.py +80 -0
- dstack/_internal/cli/commands/apply.py +100 -0
- dstack/_internal/cli/commands/attach.py +161 -0
- dstack/_internal/cli/commands/completion.py +22 -0
- dstack/_internal/cli/commands/delete.py +44 -0
- dstack/_internal/cli/commands/event.py +168 -0
- dstack/_internal/cli/commands/fleet.py +161 -0
- dstack/_internal/cli/commands/gateway.py +159 -0
- dstack/_internal/cli/commands/init.py +64 -0
- dstack/_internal/cli/commands/login.py +352 -0
- dstack/_internal/cli/commands/logs.py +62 -0
- dstack/_internal/cli/commands/metrics.py +153 -0
- dstack/_internal/cli/commands/offer.py +146 -0
- dstack/_internal/cli/commands/project.py +259 -0
- dstack/_internal/cli/commands/ps.py +81 -0
- dstack/_internal/cli/commands/run.py +69 -0
- dstack/_internal/cli/commands/secrets.py +92 -0
- dstack/_internal/cli/commands/server.py +96 -0
- dstack/_internal/cli/commands/stop.py +26 -0
- dstack/_internal/cli/commands/volume.py +117 -0
- dstack/_internal/cli/main.py +101 -0
- dstack/_internal/cli/models/gateways.py +16 -0
- dstack/_internal/cli/models/offers.py +47 -0
- dstack/_internal/cli/models/runs.py +16 -0
- dstack/_internal/cli/services/args.py +31 -0
- dstack/_internal/cli/services/completion.py +91 -0
- dstack/_internal/cli/services/configurators/__init__.py +86 -0
- dstack/_internal/cli/services/configurators/base.py +103 -0
- dstack/_internal/cli/services/configurators/fleet.py +475 -0
- dstack/_internal/cli/services/configurators/gateway.py +231 -0
- dstack/_internal/cli/services/configurators/run.py +882 -0
- dstack/_internal/cli/services/configurators/volume.py +222 -0
- dstack/_internal/cli/services/events.py +68 -0
- dstack/_internal/cli/services/profile.py +182 -0
- dstack/_internal/cli/services/repos.py +71 -0
- dstack/_internal/cli/services/resources.py +54 -0
- dstack/_internal/cli/utils/common.py +159 -0
- dstack/_internal/cli/utils/fleet.py +106 -0
- dstack/_internal/cli/utils/gateway.py +56 -0
- dstack/_internal/cli/utils/gpu.py +178 -0
- dstack/_internal/cli/utils/rich.py +156 -0
- dstack/_internal/cli/utils/run.py +517 -0
- dstack/_internal/cli/utils/secrets.py +25 -0
- dstack/_internal/cli/utils/updates.py +98 -0
- dstack/_internal/cli/utils/volume.py +58 -0
- dstack/_internal/compat.py +3 -0
- dstack/_internal/core/backends/amddevcloud/__init__.py +1 -0
- dstack/_internal/core/backends/amddevcloud/backend.py +16 -0
- dstack/_internal/core/backends/amddevcloud/compute.py +5 -0
- dstack/_internal/core/backends/amddevcloud/configurator.py +29 -0
- dstack/_internal/core/backends/aws/auth.py +30 -0
- dstack/_internal/core/backends/aws/backend.py +31 -0
- dstack/_internal/core/backends/aws/compute.py +1153 -0
- dstack/_internal/core/backends/aws/configurator.py +191 -0
- dstack/_internal/core/backends/aws/models.py +135 -0
- dstack/_internal/core/backends/aws/resources.py +700 -0
- dstack/_internal/core/backends/azure/auth.py +39 -0
- dstack/_internal/core/backends/azure/backend.py +21 -0
- dstack/_internal/core/backends/azure/compute.py +676 -0
- dstack/_internal/core/backends/azure/configurator.py +472 -0
- dstack/_internal/core/backends/azure/models.py +98 -0
- dstack/_internal/core/backends/azure/resources.py +116 -0
- dstack/_internal/core/backends/azure/utils.py +42 -0
- dstack/_internal/core/backends/base/backend.py +18 -0
- dstack/_internal/core/backends/base/compute.py +1101 -0
- dstack/_internal/core/backends/base/configurator.py +117 -0
- dstack/_internal/core/backends/base/models.py +24 -0
- dstack/_internal/core/backends/base/offers.py +232 -0
- dstack/_internal/core/backends/cloudrift/api_client.py +220 -0
- dstack/_internal/core/backends/cloudrift/backend.py +16 -0
- dstack/_internal/core/backends/cloudrift/compute.py +138 -0
- dstack/_internal/core/backends/cloudrift/configurator.py +72 -0
- dstack/_internal/core/backends/cloudrift/models.py +40 -0
- dstack/_internal/core/backends/configurators.py +181 -0
- dstack/_internal/core/backends/cudo/__init__.py +0 -0
- dstack/_internal/core/backends/cudo/api_client.py +111 -0
- dstack/_internal/core/backends/cudo/backend.py +16 -0
- dstack/_internal/core/backends/cudo/compute.py +174 -0
- dstack/_internal/core/backends/cudo/configurator.py +63 -0
- dstack/_internal/core/backends/cudo/models.py +37 -0
- dstack/_internal/core/backends/datacrunch/__init__.py +1 -0
- dstack/_internal/core/backends/datacrunch/backend.py +18 -0
- dstack/_internal/core/backends/datacrunch/compute.py +8 -0
- dstack/_internal/core/backends/datacrunch/configurator.py +17 -0
- dstack/_internal/core/backends/digitalocean/__init__.py +1 -0
- dstack/_internal/core/backends/digitalocean/backend.py +16 -0
- dstack/_internal/core/backends/digitalocean/compute.py +5 -0
- dstack/_internal/core/backends/digitalocean/configurator.py +31 -0
- dstack/_internal/core/backends/digitalocean_base/__init__.py +1 -0
- dstack/_internal/core/backends/digitalocean_base/api_client.py +104 -0
- dstack/_internal/core/backends/digitalocean_base/backend.py +5 -0
- dstack/_internal/core/backends/digitalocean_base/compute.py +174 -0
- dstack/_internal/core/backends/digitalocean_base/configurator.py +57 -0
- dstack/_internal/core/backends/digitalocean_base/models.py +43 -0
- dstack/_internal/core/backends/dstack/__init__.py +0 -0
- dstack/_internal/core/backends/dstack/models.py +26 -0
- dstack/_internal/core/backends/features.py +74 -0
- dstack/_internal/core/backends/gcp/__init__.py +0 -0
- dstack/_internal/core/backends/gcp/auth.py +57 -0
- dstack/_internal/core/backends/gcp/backend.py +17 -0
- dstack/_internal/core/backends/gcp/compute.py +1257 -0
- dstack/_internal/core/backends/gcp/configurator.py +206 -0
- dstack/_internal/core/backends/gcp/features/__init__.py +0 -0
- dstack/_internal/core/backends/gcp/features/tcpx.py +65 -0
- dstack/_internal/core/backends/gcp/models.py +160 -0
- dstack/_internal/core/backends/gcp/resources.py +585 -0
- dstack/_internal/core/backends/hotaisle/__init__.py +1 -0
- dstack/_internal/core/backends/hotaisle/api_client.py +101 -0
- dstack/_internal/core/backends/hotaisle/backend.py +16 -0
- dstack/_internal/core/backends/hotaisle/compute.py +188 -0
- dstack/_internal/core/backends/hotaisle/configurator.py +66 -0
- dstack/_internal/core/backends/hotaisle/models.py +45 -0
- dstack/_internal/core/backends/kubernetes/__init__.py +0 -0
- dstack/_internal/core/backends/kubernetes/backend.py +16 -0
- dstack/_internal/core/backends/kubernetes/compute.py +1077 -0
- dstack/_internal/core/backends/kubernetes/configurator.py +61 -0
- dstack/_internal/core/backends/kubernetes/models.py +71 -0
- dstack/_internal/core/backends/kubernetes/utils.py +81 -0
- dstack/_internal/core/backends/lambdalabs/__init__.py +0 -0
- dstack/_internal/core/backends/lambdalabs/api_client.py +87 -0
- dstack/_internal/core/backends/lambdalabs/backend.py +17 -0
- dstack/_internal/core/backends/lambdalabs/compute.py +233 -0
- dstack/_internal/core/backends/lambdalabs/configurator.py +65 -0
- dstack/_internal/core/backends/lambdalabs/models.py +37 -0
- dstack/_internal/core/backends/local/__init__.py +0 -0
- dstack/_internal/core/backends/local/backend.py +14 -0
- dstack/_internal/core/backends/local/compute.py +130 -0
- dstack/_internal/core/backends/models.py +158 -0
- dstack/_internal/core/backends/nebius/__init__.py +0 -0
- dstack/_internal/core/backends/nebius/backend.py +16 -0
- dstack/_internal/core/backends/nebius/compute.py +401 -0
- dstack/_internal/core/backends/nebius/configurator.py +98 -0
- dstack/_internal/core/backends/nebius/models.py +185 -0
- dstack/_internal/core/backends/nebius/resources.py +433 -0
- dstack/_internal/core/backends/oci/__init__.py +0 -0
- dstack/_internal/core/backends/oci/auth.py +21 -0
- dstack/_internal/core/backends/oci/backend.py +16 -0
- dstack/_internal/core/backends/oci/compute.py +209 -0
- dstack/_internal/core/backends/oci/configurator.py +156 -0
- dstack/_internal/core/backends/oci/exceptions.py +15 -0
- dstack/_internal/core/backends/oci/models.py +87 -0
- dstack/_internal/core/backends/oci/region.py +86 -0
- dstack/_internal/core/backends/oci/resources.py +836 -0
- dstack/_internal/core/backends/runpod/__init__.py +0 -0
- dstack/_internal/core/backends/runpod/api_client.py +627 -0
- dstack/_internal/core/backends/runpod/backend.py +16 -0
- dstack/_internal/core/backends/runpod/compute.py +444 -0
- dstack/_internal/core/backends/runpod/configurator.py +63 -0
- dstack/_internal/core/backends/runpod/models.py +54 -0
- dstack/_internal/core/backends/template/__init__.py +0 -0
- dstack/_internal/core/backends/template/backend.py.jinja +16 -0
- dstack/_internal/core/backends/template/compute.py.jinja +95 -0
- dstack/_internal/core/backends/template/configurator.py.jinja +69 -0
- dstack/_internal/core/backends/template/models.py.jinja +62 -0
- dstack/_internal/core/backends/tensordock/models.py +40 -0
- dstack/_internal/core/backends/vastai/__init__.py +0 -0
- dstack/_internal/core/backends/vastai/api_client.py +143 -0
- dstack/_internal/core/backends/vastai/backend.py +16 -0
- dstack/_internal/core/backends/vastai/compute.py +141 -0
- dstack/_internal/core/backends/vastai/configurator.py +69 -0
- dstack/_internal/core/backends/vastai/models.py +37 -0
- dstack/_internal/core/backends/verda/__init__.py +0 -0
- dstack/_internal/core/backends/verda/backend.py +16 -0
- dstack/_internal/core/backends/verda/compute.py +266 -0
- dstack/_internal/core/backends/verda/configurator.py +73 -0
- dstack/_internal/core/backends/verda/models.py +38 -0
- dstack/_internal/core/backends/vultr/__init__.py +0 -0
- dstack/_internal/core/backends/vultr/api_client.py +116 -0
- dstack/_internal/core/backends/vultr/backend.py +16 -0
- dstack/_internal/core/backends/vultr/compute.py +167 -0
- dstack/_internal/core/backends/vultr/configurator.py +71 -0
- dstack/_internal/core/backends/vultr/models.py +34 -0
- dstack/_internal/core/compatibility/__init__.py +0 -0
- dstack/_internal/core/compatibility/events.py +13 -0
- dstack/_internal/core/compatibility/fleets.py +58 -0
- dstack/_internal/core/compatibility/gateways.py +39 -0
- dstack/_internal/core/compatibility/gpus.py +13 -0
- dstack/_internal/core/compatibility/logs.py +14 -0
- dstack/_internal/core/compatibility/runs.py +86 -0
- dstack/_internal/core/compatibility/volumes.py +37 -0
- dstack/_internal/core/consts.py +8 -0
- dstack/_internal/core/errors.py +160 -0
- dstack/_internal/core/models/__init__.py +0 -0
- dstack/_internal/core/models/auth.py +28 -0
- dstack/_internal/core/models/backends/__init__.py +0 -0
- dstack/_internal/core/models/backends/base.py +48 -0
- dstack/_internal/core/models/common.py +143 -0
- dstack/_internal/core/models/compute_groups.py +39 -0
- dstack/_internal/core/models/config.py +28 -0
- dstack/_internal/core/models/configurations.py +1123 -0
- dstack/_internal/core/models/envs.py +149 -0
- dstack/_internal/core/models/events.py +98 -0
- dstack/_internal/core/models/files.py +67 -0
- dstack/_internal/core/models/fleets.py +437 -0
- dstack/_internal/core/models/gateways.py +146 -0
- dstack/_internal/core/models/gpus.py +45 -0
- dstack/_internal/core/models/health.py +28 -0
- dstack/_internal/core/models/instances.py +346 -0
- dstack/_internal/core/models/logs.py +27 -0
- dstack/_internal/core/models/metrics.py +14 -0
- dstack/_internal/core/models/placement.py +27 -0
- dstack/_internal/core/models/profiles.py +431 -0
- dstack/_internal/core/models/projects.py +46 -0
- dstack/_internal/core/models/repos/__init__.py +34 -0
- dstack/_internal/core/models/repos/base.py +36 -0
- dstack/_internal/core/models/repos/local.py +96 -0
- dstack/_internal/core/models/repos/remote.py +341 -0
- dstack/_internal/core/models/repos/virtual.py +85 -0
- dstack/_internal/core/models/resources.py +424 -0
- dstack/_internal/core/models/routers.py +24 -0
- dstack/_internal/core/models/runs.py +618 -0
- dstack/_internal/core/models/secrets.py +16 -0
- dstack/_internal/core/models/server.py +7 -0
- dstack/_internal/core/models/services.py +76 -0
- dstack/_internal/core/models/unix.py +53 -0
- dstack/_internal/core/models/users.py +60 -0
- dstack/_internal/core/models/volumes.py +221 -0
- dstack/_internal/core/services/__init__.py +16 -0
- dstack/_internal/core/services/api_client.py +15 -0
- dstack/_internal/core/services/configs/__init__.py +116 -0
- dstack/_internal/core/services/diff.py +71 -0
- dstack/_internal/core/services/logs.py +58 -0
- dstack/_internal/core/services/profiles.py +46 -0
- dstack/_internal/core/services/repos.py +236 -0
- dstack/_internal/core/services/ssh/__init__.py +27 -0
- dstack/_internal/core/services/ssh/attach.py +241 -0
- dstack/_internal/core/services/ssh/client.py +113 -0
- dstack/_internal/core/services/ssh/key_manager.py +53 -0
- dstack/_internal/core/services/ssh/ports.py +89 -0
- dstack/_internal/core/services/ssh/tunnel.py +337 -0
- dstack/_internal/proxy/__init__.py +8 -0
- dstack/_internal/proxy/gateway/__init__.py +0 -0
- dstack/_internal/proxy/gateway/app.py +89 -0
- dstack/_internal/proxy/gateway/auth.py +26 -0
- dstack/_internal/proxy/gateway/const.py +7 -0
- dstack/_internal/proxy/gateway/deps.py +73 -0
- dstack/_internal/proxy/gateway/main.py +17 -0
- dstack/_internal/proxy/gateway/models.py +23 -0
- dstack/_internal/proxy/gateway/repo/__init__.py +0 -0
- dstack/_internal/proxy/gateway/repo/repo.py +121 -0
- dstack/_internal/proxy/gateway/repo/state_v1.py +164 -0
- dstack/_internal/proxy/gateway/resources/nginx/00-log-format.conf +11 -0
- dstack/_internal/proxy/gateway/resources/nginx/entrypoint.jinja2 +27 -0
- dstack/_internal/proxy/gateway/resources/nginx/router_workers.jinja2 +23 -0
- dstack/_internal/proxy/gateway/resources/nginx/service.jinja2 +105 -0
- dstack/_internal/proxy/gateway/routers/__init__.py +0 -0
- dstack/_internal/proxy/gateway/routers/auth.py +10 -0
- dstack/_internal/proxy/gateway/routers/config.py +28 -0
- dstack/_internal/proxy/gateway/routers/registry.py +124 -0
- dstack/_internal/proxy/gateway/routers/stats.py +18 -0
- dstack/_internal/proxy/gateway/schemas/__init__.py +0 -0
- dstack/_internal/proxy/gateway/schemas/common.py +5 -0
- dstack/_internal/proxy/gateway/schemas/config.py +9 -0
- dstack/_internal/proxy/gateway/schemas/registry.py +63 -0
- dstack/_internal/proxy/gateway/schemas/stats.py +15 -0
- dstack/_internal/proxy/gateway/services/__init__.py +0 -0
- dstack/_internal/proxy/gateway/services/model_routers/__init__.py +18 -0
- dstack/_internal/proxy/gateway/services/model_routers/base.py +91 -0
- dstack/_internal/proxy/gateway/services/model_routers/sglang.py +269 -0
- dstack/_internal/proxy/gateway/services/nginx.py +455 -0
- dstack/_internal/proxy/gateway/services/registry.py +426 -0
- dstack/_internal/proxy/gateway/services/server_client.py +95 -0
- dstack/_internal/proxy/gateway/services/stats.py +170 -0
- dstack/_internal/proxy/gateway/testing/__init__.py +0 -0
- dstack/_internal/proxy/gateway/testing/common.py +13 -0
- dstack/_internal/proxy/lib/__init__.py +0 -0
- dstack/_internal/proxy/lib/auth.py +7 -0
- dstack/_internal/proxy/lib/deps.py +106 -0
- dstack/_internal/proxy/lib/errors.py +14 -0
- dstack/_internal/proxy/lib/models.py +112 -0
- dstack/_internal/proxy/lib/repo.py +27 -0
- dstack/_internal/proxy/lib/routers/__init__.py +0 -0
- dstack/_internal/proxy/lib/routers/model_proxy.py +102 -0
- dstack/_internal/proxy/lib/schemas/__init__.py +0 -0
- dstack/_internal/proxy/lib/schemas/model_proxy.py +77 -0
- dstack/_internal/proxy/lib/services/__init__.py +0 -0
- dstack/_internal/proxy/lib/services/model_proxy/__init__.py +0 -0
- dstack/_internal/proxy/lib/services/model_proxy/clients/__init__.py +0 -0
- dstack/_internal/proxy/lib/services/model_proxy/clients/base.py +18 -0
- dstack/_internal/proxy/lib/services/model_proxy/clients/openai.py +67 -0
- dstack/_internal/proxy/lib/services/model_proxy/clients/tgi.py +208 -0
- dstack/_internal/proxy/lib/services/model_proxy/model_proxy.py +23 -0
- dstack/_internal/proxy/lib/services/service_connection.py +160 -0
- dstack/_internal/proxy/lib/testing/__init__.py +0 -0
- dstack/_internal/proxy/lib/testing/auth.py +11 -0
- dstack/_internal/proxy/lib/testing/common.py +51 -0
- dstack/_internal/server/__init__.py +0 -0
- dstack/_internal/server/alembic.ini +100 -0
- dstack/_internal/server/app.py +432 -0
- dstack/_internal/server/background/__init__.py +142 -0
- dstack/_internal/server/background/tasks/__init__.py +0 -0
- dstack/_internal/server/background/tasks/common.py +24 -0
- dstack/_internal/server/background/tasks/process_compute_groups.py +167 -0
- dstack/_internal/server/background/tasks/process_events.py +17 -0
- dstack/_internal/server/background/tasks/process_fleets.py +289 -0
- dstack/_internal/server/background/tasks/process_gateways.py +188 -0
- dstack/_internal/server/background/tasks/process_idle_volumes.py +145 -0
- dstack/_internal/server/background/tasks/process_instances.py +1186 -0
- dstack/_internal/server/background/tasks/process_metrics.py +172 -0
- dstack/_internal/server/background/tasks/process_placement_groups.py +104 -0
- dstack/_internal/server/background/tasks/process_probes.py +164 -0
- dstack/_internal/server/background/tasks/process_prometheus_metrics.py +150 -0
- dstack/_internal/server/background/tasks/process_running_jobs.py +1238 -0
- dstack/_internal/server/background/tasks/process_runs.py +842 -0
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +1106 -0
- dstack/_internal/server/background/tasks/process_terminating_jobs.py +108 -0
- dstack/_internal/server/background/tasks/process_volumes.py +129 -0
- dstack/_internal/server/compatibility/__init__.py +0 -0
- dstack/_internal/server/compatibility/common.py +20 -0
- dstack/_internal/server/compatibility/gpus.py +22 -0
- dstack/_internal/server/db.py +127 -0
- dstack/_internal/server/deps.py +19 -0
- dstack/_internal/server/main.py +4 -0
- dstack/_internal/server/migrations/__init__.py +0 -0
- dstack/_internal/server/migrations/env.py +112 -0
- dstack/_internal/server/migrations/script.py.mako +28 -0
- dstack/_internal/server/migrations/versions/006512f572b4_add_projects_original_name.py +38 -0
- dstack/_internal/server/migrations/versions/065588ec72b8_add_vultr_to_backendtype_enum.py +81 -0
- dstack/_internal/server/migrations/versions/06e977bc61c7_add_usermodel_deleted_and_original_name.py +45 -0
- dstack/_internal/server/migrations/versions/0e33559e16ed_update_instancestatus.py +64 -0
- dstack/_internal/server/migrations/versions/112753bc17dd_remove_nullable_fields.py +50 -0
- dstack/_internal/server/migrations/versions/1338b788b612_reverse_job_instance_relationship.py +71 -0
- dstack/_internal/server/migrations/versions/14f2cb002fc2_add_jobmodel_removed_flag.py +44 -0
- dstack/_internal/server/migrations/versions/1a48dfe44a40_rework_termination_handling.py +42 -0
- dstack/_internal/server/migrations/versions/1aa9638ad963_added_email_index.py +31 -0
- dstack/_internal/server/migrations/versions/1e3fb39ef74b_add_remote_connection_details.py +26 -0
- dstack/_internal/server/migrations/versions/1e76fb0dde87_add_jobmodel_inactivity_secs.py +32 -0
- dstack/_internal/server/migrations/versions/20166748b60c_add_jobmodel_disconnected_at.py +100 -0
- dstack/_internal/server/migrations/versions/22d74df9897e_add_events_and_event_targets.py +99 -0
- dstack/_internal/server/migrations/versions/23e01c56279a_make_blob_nullable.py +32 -0
- dstack/_internal/server/migrations/versions/2498ab323443_add_fleetmodel_consolidation_attempt_.py +44 -0
- dstack/_internal/server/migrations/versions/252d3743b641_.py +40 -0
- dstack/_internal/server/migrations/versions/25479f540245_add_probes.py +43 -0
- dstack/_internal/server/migrations/versions/27d3e55759fa_add_pools.py +152 -0
- dstack/_internal/server/migrations/versions/29826f417010_remove_instancemodel_retry_policy.py +34 -0
- dstack/_internal/server/migrations/versions/29c08c6a8cb3_.py +36 -0
- dstack/_internal/server/migrations/versions/35e90e1b0d3e_add_rolling_deployment_fields.py +42 -0
- dstack/_internal/server/migrations/versions/35f732ee4cf5_add_projectmodel_is_public.py +39 -0
- dstack/_internal/server/migrations/versions/3cf77fb8bcf1_store_repo_clone_url.py +85 -0
- dstack/_internal/server/migrations/versions/3d7f6c2ec000_add_jobmodel_registered.py +28 -0
- dstack/_internal/server/migrations/versions/3dbdce90d0e0_fix_code_uq_constraint.py +33 -0
- dstack/_internal/server/migrations/versions/48ad3ecbaea2_do_not_delete_projects_and_runs.py +46 -0
- dstack/_internal/server/migrations/versions/4ae1a5b0e7f1_add_run_list_index.py +34 -0
- dstack/_internal/server/migrations/versions/4b4319398164_introduce_runs_processing.py +144 -0
- dstack/_internal/server/migrations/versions/50dd7ea98639_index_status_columns.py +55 -0
- dstack/_internal/server/migrations/versions/51d45659d574_add_instancemodel_blocks_fields.py +43 -0
- dstack/_internal/server/migrations/versions/54a77e19c64c_add_manager_project_role.py +67 -0
- dstack/_internal/server/migrations/versions/555138b1f77f_change_instancemodel_for_asynchronous_.py +61 -0
- dstack/_internal/server/migrations/versions/58aa5162dcc3_add_gatewaymodel_configuration.py +32 -0
- dstack/_internal/server/migrations/versions/5ad8debc8fe6_fixes_for_psql.py +329 -0
- dstack/_internal/server/migrations/versions/5ec538b70e71_replace_instansestatus.py +31 -0
- dstack/_internal/server/migrations/versions/5f1707c525d2_add_filearchivemodel.py +39 -0
- dstack/_internal/server/migrations/versions/5fd659afca82_add_ix_instances_fleet_id.py +31 -0
- dstack/_internal/server/migrations/versions/60e444118b6d_add_jobprometheusmetrics.py +40 -0
- dstack/_internal/server/migrations/versions/63c3f19cb184_add_jobterminationreason_inactivity_.py +83 -0
- dstack/_internal/server/migrations/versions/644b8a114187_add_secretmodel.py +49 -0
- dstack/_internal/server/migrations/versions/686fb8341ea5_add_user_emails.py +32 -0
- dstack/_internal/server/migrations/versions/6c1a9d6530ee_add_jobmodel_exit_status.py +26 -0
- dstack/_internal/server/migrations/versions/706e0acc3a7d_add_runmodel_desired_replica_counts.py +26 -0
- dstack/_internal/server/migrations/versions/710e5b3fac8f_add_encryption.py +54 -0
- dstack/_internal/server/migrations/versions/728b1488b1b4_add_instance_health.py +50 -0
- dstack/_internal/server/migrations/versions/74a1f55209bd_store_enums_as_strings.py +484 -0
- dstack/_internal/server/migrations/versions/7b24b1c8eba7_add_instancemodel_last_processed_at.py +68 -0
- dstack/_internal/server/migrations/versions/7ba3b59d7ca6_add_runmodel_resubmission_attempt.py +35 -0
- dstack/_internal/server/migrations/versions/7bc2586e8b9e_make_instancemodel_pool_id_optional.py +36 -0
- dstack/_internal/server/migrations/versions/7d1ec2b920ac_add_computegroupmodel.py +91 -0
- dstack/_internal/server/migrations/versions/803c7e9ed85d_add_jobmodel_job_runtime_data.py +32 -0
- dstack/_internal/server/migrations/versions/82b32a135ea2_.py +58 -0
- dstack/_internal/server/migrations/versions/866ec1d67184_replace_retrypolicy_limit_with_.py +93 -0
- dstack/_internal/server/migrations/versions/903c91e24634_add_instances_termination_reason_message.py +34 -0
- dstack/_internal/server/migrations/versions/91a12fff6c76_add_repocredsmodel.py +43 -0
- dstack/_internal/server/migrations/versions/91ac5e543037_extend_repos_creds_column.py +36 -0
- dstack/_internal/server/migrations/versions/98cd9c8b5927_add_volumemodel.py +73 -0
- dstack/_internal/server/migrations/versions/98d1b92988bc_add_jobterminationreason_terminated_due_.py +140 -0
- dstack/_internal/server/migrations/versions/99b4c8c954ea_add_termination_reason_message.py +71 -0
- dstack/_internal/server/migrations/versions/9eea6af28e10_added_fail_reason_for_instancemodel.py +36 -0
- dstack/_internal/server/migrations/versions/__init__.py +0 -0
- dstack/_internal/server/migrations/versions/a060e2440936_.py +206 -0
- dstack/_internal/server/migrations/versions/a751ef183f27_move_attachment_data_to_volumes_.py +34 -0
- dstack/_internal/server/migrations/versions/a7b46c073fa1_add_placementgroupmodel.py +58 -0
- dstack/_internal/server/migrations/versions/afbc600ff2b2_add_created_at_to_usermodel_and_.py +102 -0
- dstack/_internal/server/migrations/versions/b4d6ad60db08_add_instancemodel_unreachable.py +37 -0
- dstack/_internal/server/migrations/versions/b88d55c2a07d_replace_instancestatus_ready.py +21 -0
- dstack/_internal/server/migrations/versions/bc8ca4a505c6_store_backendtype_as_string.py +171 -0
- dstack/_internal/server/migrations/versions/bca2fdf130bf_add_runmodel_priority.py +34 -0
- dstack/_internal/server/migrations/versions/bfba43f6def2_.py +32 -0
- dstack/_internal/server/migrations/versions/c00090eaef21_support_fleets.py +108 -0
- dstack/_internal/server/migrations/versions/c154eece89da_add_fields_for_async_gateway_creation.py +74 -0
- dstack/_internal/server/migrations/versions/c20626d03cfb_add_jobmetricspoint.py +43 -0
- dstack/_internal/server/migrations/versions/c48df7985d57_add_instance_termination_retries.py +38 -0
- dstack/_internal/server/migrations/versions/c83d45f9a971_replace_string_with_text.py +150 -0
- dstack/_internal/server/migrations/versions/d0bb68e48b9f_add_project_owners_and_quotas.py +106 -0
- dstack/_internal/server/migrations/versions/d3e8af4786fa_gateway_compute_flag_deleted.py +34 -0
- dstack/_internal/server/migrations/versions/d4d9dc26cf58_add_ix_jobs_run_id.py +31 -0
- dstack/_internal/server/migrations/versions/d5863798bf41_add_volumemodel_last_job_processed_at.py +40 -0
- dstack/_internal/server/migrations/versions/d6b11105f659_add_usermodel_active.py +36 -0
- dstack/_internal/server/migrations/versions/da574e93fee0_add_jobmodel_volumes_detached_at.py +40 -0
- dstack/_internal/server/migrations/versions/dfffd6a1165c_add_fields_for_gateways_behind_alb.py +36 -0
- dstack/_internal/server/migrations/versions/e2d08cd1b8d9_add_jobmodel_fleet.py +41 -0
- dstack/_internal/server/migrations/versions/e3b7db07727f_add_gatewaycomputemodel_app_updated_at.py +61 -0
- dstack/_internal/server/migrations/versions/e6391ca6a264_separate_gateways_from_compute.py +72 -0
- dstack/_internal/server/migrations/versions/ea60480f82bb_add_membermodel_member_num.py +32 -0
- dstack/_internal/server/migrations/versions/ec02a26a256c_add_runmodel_next_triggered_at.py +38 -0
- dstack/_internal/server/migrations/versions/ed0ca30e13bb_migrate_instancestatus_provisioning.py +29 -0
- dstack/_internal/server/migrations/versions/fe72c4de8376_add_gateways.py +81 -0
- dstack/_internal/server/migrations/versions/ff1d94f65b08_user_ssh_key.py +34 -0
- dstack/_internal/server/migrations/versions/ffa99edd1988_add_jobterminationreason_max_duration_.py +81 -0
- dstack/_internal/server/models.py +930 -0
- dstack/_internal/server/routers/__init__.py +0 -0
- dstack/_internal/server/routers/auth.py +34 -0
- dstack/_internal/server/routers/backends.py +142 -0
- dstack/_internal/server/routers/events.py +60 -0
- dstack/_internal/server/routers/files.py +68 -0
- dstack/_internal/server/routers/fleets.py +202 -0
- dstack/_internal/server/routers/gateways.py +109 -0
- dstack/_internal/server/routers/gpus.py +32 -0
- dstack/_internal/server/routers/instances.py +77 -0
- dstack/_internal/server/routers/logs.py +34 -0
- dstack/_internal/server/routers/metrics.py +82 -0
- dstack/_internal/server/routers/projects.py +205 -0
- dstack/_internal/server/routers/prometheus.py +35 -0
- dstack/_internal/server/routers/repos.py +118 -0
- dstack/_internal/server/routers/runs.py +216 -0
- dstack/_internal/server/routers/secrets.py +86 -0
- dstack/_internal/server/routers/server.py +19 -0
- dstack/_internal/server/routers/users.py +158 -0
- dstack/_internal/server/routers/volumes.py +122 -0
- dstack/_internal/server/schemas/__init__.py +0 -0
- dstack/_internal/server/schemas/auth.py +83 -0
- dstack/_internal/server/schemas/backends.py +16 -0
- dstack/_internal/server/schemas/common.py +9 -0
- dstack/_internal/server/schemas/events.py +211 -0
- dstack/_internal/server/schemas/files.py +5 -0
- dstack/_internal/server/schemas/fleets.py +49 -0
- dstack/_internal/server/schemas/gateways.py +31 -0
- dstack/_internal/server/schemas/gpus.py +26 -0
- dstack/_internal/server/schemas/health/__init__.py +0 -0
- dstack/_internal/server/schemas/health/dcgm.py +56 -0
- dstack/_internal/server/schemas/instances.py +47 -0
- dstack/_internal/server/schemas/logs.py +17 -0
- dstack/_internal/server/schemas/projects.py +81 -0
- dstack/_internal/server/schemas/repos.py +24 -0
- dstack/_internal/server/schemas/runner.py +269 -0
- dstack/_internal/server/schemas/runs.py +66 -0
- dstack/_internal/server/schemas/secrets.py +16 -0
- dstack/_internal/server/schemas/users.py +72 -0
- dstack/_internal/server/schemas/volumes.py +29 -0
- dstack/_internal/server/security/__init__.py +0 -0
- dstack/_internal/server/security/permissions.py +251 -0
- dstack/_internal/server/services/__init__.py +0 -0
- dstack/_internal/server/services/auth.py +77 -0
- dstack/_internal/server/services/backends/__init__.py +404 -0
- dstack/_internal/server/services/backends/handlers.py +105 -0
- dstack/_internal/server/services/compute_groups.py +22 -0
- dstack/_internal/server/services/config.py +279 -0
- dstack/_internal/server/services/docker.py +162 -0
- dstack/_internal/server/services/encryption/__init__.py +102 -0
- dstack/_internal/server/services/encryption/keys/__init__.py +0 -0
- dstack/_internal/server/services/encryption/keys/aes.py +68 -0
- dstack/_internal/server/services/encryption/keys/base.py +19 -0
- dstack/_internal/server/services/encryption/keys/identity.py +28 -0
- dstack/_internal/server/services/events.py +477 -0
- dstack/_internal/server/services/files.py +91 -0
- dstack/_internal/server/services/fleets.py +1224 -0
- dstack/_internal/server/services/gateways/__init__.py +686 -0
- dstack/_internal/server/services/gateways/client.py +209 -0
- dstack/_internal/server/services/gateways/connection.py +139 -0
- dstack/_internal/server/services/gateways/pool.py +58 -0
- dstack/_internal/server/services/gpus.py +387 -0
- dstack/_internal/server/services/instances.py +731 -0
- dstack/_internal/server/services/jobs/__init__.py +840 -0
- dstack/_internal/server/services/jobs/configurators/__init__.py +0 -0
- dstack/_internal/server/services/jobs/configurators/base.py +469 -0
- dstack/_internal/server/services/jobs/configurators/dev.py +69 -0
- dstack/_internal/server/services/jobs/configurators/extensions/__init__.py +0 -0
- dstack/_internal/server/services/jobs/configurators/extensions/base.py +15 -0
- dstack/_internal/server/services/jobs/configurators/extensions/cursor.py +42 -0
- dstack/_internal/server/services/jobs/configurators/extensions/vscode.py +42 -0
- dstack/_internal/server/services/jobs/configurators/extensions/windsurf.py +43 -0
- dstack/_internal/server/services/jobs/configurators/service.py +28 -0
- dstack/_internal/server/services/jobs/configurators/task.py +39 -0
- dstack/_internal/server/services/locking.py +187 -0
- dstack/_internal/server/services/logging.py +29 -0
- dstack/_internal/server/services/logs/__init__.py +122 -0
- dstack/_internal/server/services/logs/aws.py +373 -0
- dstack/_internal/server/services/logs/base.py +47 -0
- dstack/_internal/server/services/logs/filelog.py +261 -0
- dstack/_internal/server/services/logs/fluentbit.py +329 -0
- dstack/_internal/server/services/logs/gcp.py +181 -0
- dstack/_internal/server/services/metrics.py +172 -0
- dstack/_internal/server/services/offers.py +249 -0
- dstack/_internal/server/services/permissions.py +37 -0
- dstack/_internal/server/services/placement.py +234 -0
- dstack/_internal/server/services/plugins.py +109 -0
- dstack/_internal/server/services/probes.py +10 -0
- dstack/_internal/server/services/projects.py +835 -0
- dstack/_internal/server/services/prometheus/__init__.py +0 -0
- dstack/_internal/server/services/prometheus/client_metrics.py +55 -0
- dstack/_internal/server/services/prometheus/custom_metrics.py +327 -0
- dstack/_internal/server/services/proxy/__init__.py +3 -0
- dstack/_internal/server/services/proxy/auth.py +12 -0
- dstack/_internal/server/services/proxy/deps.py +18 -0
- dstack/_internal/server/services/proxy/repo.py +189 -0
- dstack/_internal/server/services/proxy/routers/__init__.py +0 -0
- dstack/_internal/server/services/proxy/routers/service_proxy.py +49 -0
- dstack/_internal/server/services/proxy/services/__init__.py +0 -0
- dstack/_internal/server/services/proxy/services/service_proxy.py +135 -0
- dstack/_internal/server/services/repos.py +362 -0
- dstack/_internal/server/services/requirements/__init__.py +0 -0
- dstack/_internal/server/services/requirements/combine.py +260 -0
- dstack/_internal/server/services/resources.py +21 -0
- dstack/_internal/server/services/runner/__init__.py +0 -0
- dstack/_internal/server/services/runner/client.py +646 -0
- dstack/_internal/server/services/runner/ssh.py +128 -0
- dstack/_internal/server/services/runs/__init__.py +1026 -0
- dstack/_internal/server/services/runs/plan.py +703 -0
- dstack/_internal/server/services/runs/replicas.py +317 -0
- dstack/_internal/server/services/runs/spec.py +191 -0
- dstack/_internal/server/services/secrets.py +245 -0
- dstack/_internal/server/services/services/__init__.py +345 -0
- dstack/_internal/server/services/services/autoscalers.py +140 -0
- dstack/_internal/server/services/services/options.py +53 -0
- dstack/_internal/server/services/ssh.py +67 -0
- dstack/_internal/server/services/storage/__init__.py +37 -0
- dstack/_internal/server/services/storage/base.py +48 -0
- dstack/_internal/server/services/storage/gcs.py +66 -0
- dstack/_internal/server/services/storage/s3.py +69 -0
- dstack/_internal/server/services/users.py +461 -0
- dstack/_internal/server/services/volumes.py +496 -0
- dstack/_internal/server/settings.py +161 -0
- dstack/_internal/server/statics/00a6e1fb461ed2929fb9.png +0 -0
- dstack/_internal/server/statics/0cae4d9f0a36034984a7.png +0 -0
- dstack/_internal/server/statics/391de232cc0e30cae513.png +0 -0
- dstack/_internal/server/statics/4e0eead8c1a73689ef9d.svg +1 -0
- dstack/_internal/server/statics/544afa2f63428c2235b0.png +0 -0
- dstack/_internal/server/statics/54a4f50f74c6b9381530.svg +7 -0
- dstack/_internal/server/statics/68dd1360a7d2611e0132.svg +4 -0
- dstack/_internal/server/statics/69544b4c81973b54a66f.png +0 -0
- dstack/_internal/server/statics/77a8b02b17af19e39266.png +0 -0
- dstack/_internal/server/statics/83a93a8871c219104367.svg +9 -0
- dstack/_internal/server/statics/8f28bb8e9999e5e6a48b.svg +4 -0
- dstack/_internal/server/statics/9124086961ab8c366bc4.svg +9 -0
- dstack/_internal/server/statics/9a9ebaeb54b025dbac0a.svg +5 -0
- dstack/_internal/server/statics/a3428392dc534f3b15c4.svg +7 -0
- dstack/_internal/server/statics/ae22625574d69361f72c.png +0 -0
- dstack/_internal/server/statics/assets/android-chrome-144x144.png +0 -0
- dstack/_internal/server/statics/assets/android-chrome-192x192.png +0 -0
- dstack/_internal/server/statics/assets/android-chrome-256x256.png +0 -0
- dstack/_internal/server/statics/assets/android-chrome-36x36.png +0 -0
- dstack/_internal/server/statics/assets/android-chrome-384x384.png +0 -0
- dstack/_internal/server/statics/assets/android-chrome-48x48.png +0 -0
- dstack/_internal/server/statics/assets/android-chrome-512x512.png +0 -0
- dstack/_internal/server/statics/assets/android-chrome-72x72.png +0 -0
- dstack/_internal/server/statics/assets/android-chrome-96x96.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-icon-1024x1024.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-icon-114x114.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-icon-120x120.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-icon-144x144.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-icon-152x152.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-icon-167x167.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-icon-180x180.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-icon-57x57.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-icon-60x60.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-icon-72x72.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-icon-76x76.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-icon-precomposed.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-icon.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-1125x2436.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-1136x640.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-1170x2532.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-1179x2556.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-1242x2208.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-1242x2688.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-1284x2778.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-1290x2796.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-1334x750.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-1488x2266.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-1536x2048.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-1620x2160.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-1640x2160.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-1668x2224.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-1668x2388.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-1792x828.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-2048x1536.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-2048x2732.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-2160x1620.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-2160x1640.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-2208x1242.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-2224x1668.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-2266x1488.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-2388x1668.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-2436x1125.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-2532x1170.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-2556x1179.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-2688x1242.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-2732x2048.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-2778x1284.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-2796x1290.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-640x1136.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-750x1334.png +0 -0
- dstack/_internal/server/statics/assets/apple-touch-startup-image-828x1792.png +0 -0
- dstack/_internal/server/statics/assets/browserconfig.xml +12 -0
- dstack/_internal/server/statics/assets/favicon-16x16.png +0 -0
- dstack/_internal/server/statics/assets/favicon-32x32.png +0 -0
- dstack/_internal/server/statics/assets/favicon-48x48.png +0 -0
- dstack/_internal/server/statics/assets/favicon.ico +0 -0
- dstack/{dashboard/statics/assets/manifest.json → _internal/server/statics/assets/manifest.webmanifest} +18 -9
- dstack/_internal/server/statics/assets/mstile-144x144.png +0 -0
- dstack/_internal/server/statics/assets/mstile-150x150.png +0 -0
- dstack/_internal/server/statics/assets/mstile-310x150.png +0 -0
- dstack/_internal/server/statics/assets/mstile-310x310.png +0 -0
- dstack/_internal/server/statics/assets/mstile-70x70.png +0 -0
- dstack/_internal/server/statics/assets/yandex-browser-50x50.png +0 -0
- dstack/_internal/server/statics/b7ae68f44193474fc578.png +0 -0
- dstack/_internal/server/statics/d2f008c75b2b5b191f3f.png +0 -0
- dstack/_internal/server/statics/d44c33e1b92e05c379fd.png +0 -0
- dstack/_internal/server/statics/dd43ff0552815179d7ab.png +0 -0
- dstack/_internal/server/statics/dd4e7166c0b9aac197d7.png +0 -0
- dstack/_internal/server/statics/e30b27916930d43d2271.png +0 -0
- dstack/_internal/server/statics/e467d7d60aae81ab198b.svg +6 -0
- dstack/_internal/server/statics/eb9b344b73818fe2b71a.png +0 -0
- dstack/_internal/server/statics/f517dd626eb964120de0.png +0 -0
- dstack/_internal/server/statics/f958aecddee5d8e3222c.png +0 -0
- dstack/_internal/server/statics/index.html +3 -0
- dstack/_internal/server/statics/logo-notext.svg +116 -0
- dstack/_internal/server/statics/main-2e6967bad9f29395eea6.css +3 -0
- dstack/_internal/server/statics/main-7dc0f6d20b8b41659acc.js +155547 -0
- dstack/_internal/server/statics/main-7dc0f6d20b8b41659acc.js.map +1 -0
- dstack/{dashboard → _internal/server}/statics/manifest.json +2 -2
- dstack/_internal/server/statics/static/media/entraID.d65d1f3e9486a8e56d24fc07b3230885.svg +9 -0
- dstack/_internal/server/statics/static/media/google.b194b06fafd0a52aeb566922160ea514.svg +1 -0
- dstack/{dashboard/statics/static/media/logo.f9d7170678f68f796e270698633770ec.svg → _internal/server/statics/static/media/logo.f602feeb138844eda97c8cb641461448.svg} +8 -6
- dstack/_internal/server/statics/static/media/okta.12f178e6873a1100965f2a4dbd18fcec.svg +2 -0
- dstack/_internal/server/statics/static/media/theme.3994c817bb7dda191c1c9640dee0bf42.svg +3 -0
- dstack/_internal/server/testing/__init__.py +0 -0
- dstack/_internal/server/testing/common.py +1220 -0
- dstack/_internal/server/testing/conf.py +53 -0
- dstack/_internal/server/testing/matchers.py +31 -0
- dstack/_internal/server/utils/__init__.py +0 -0
- dstack/_internal/server/utils/common.py +55 -0
- dstack/_internal/server/utils/logging.py +51 -0
- dstack/_internal/server/utils/provisioning.py +368 -0
- dstack/_internal/server/utils/routers.py +166 -0
- dstack/_internal/server/utils/sentry_utils.py +24 -0
- dstack/_internal/settings.py +49 -0
- dstack/_internal/utils/__init__.py +0 -0
- dstack/_internal/utils/common.py +318 -0
- dstack/_internal/utils/cron.py +5 -0
- dstack/_internal/utils/crypto.py +40 -0
- dstack/_internal/utils/env.py +88 -0
- dstack/_internal/utils/event_loop.py +30 -0
- dstack/_internal/utils/files.py +69 -0
- dstack/_internal/utils/gpu.py +59 -0
- dstack/_internal/utils/hash.py +31 -0
- dstack/_internal/utils/interpolator.py +91 -0
- dstack/_internal/utils/json_schema.py +11 -0
- dstack/_internal/utils/json_utils.py +54 -0
- dstack/_internal/utils/logging.py +5 -0
- dstack/_internal/utils/nested_list.py +47 -0
- dstack/_internal/utils/network.py +50 -0
- dstack/_internal/utils/path.py +57 -0
- dstack/_internal/utils/random_names.py +258 -0
- dstack/_internal/utils/ssh.py +346 -0
- dstack/_internal/utils/tags.py +42 -0
- dstack/_internal/utils/typing.py +14 -0
- dstack/_internal/utils/version.py +22 -0
- dstack/api/__init__.py +46 -0
- dstack/api/_public/__init__.py +96 -0
- dstack/api/_public/backends.py +42 -0
- dstack/api/_public/common.py +5 -0
- dstack/api/_public/repos.py +202 -0
- dstack/api/_public/runs.py +714 -0
- dstack/api/server/__init__.py +206 -0
- dstack/api/server/_auth.py +30 -0
- dstack/api/server/_backends.py +38 -0
- dstack/api/server/_events.py +64 -0
- dstack/api/server/_files.py +18 -0
- dstack/api/server/_fleets.py +82 -0
- dstack/api/server/_gateways.py +54 -0
- dstack/api/server/_gpus.py +27 -0
- dstack/api/server/_group.py +22 -0
- dstack/api/server/_logs.py +15 -0
- dstack/api/server/_metrics.py +23 -0
- dstack/api/server/_projects.py +124 -0
- dstack/api/server/_repos.py +64 -0
- dstack/api/server/_runs.py +102 -0
- dstack/api/server/_secrets.py +36 -0
- dstack/api/server/_users.py +82 -0
- dstack/api/server/_volumes.py +39 -0
- dstack/api/server/utils.py +34 -0
- dstack/api/utils.py +105 -0
- dstack/core/__init__.py +0 -0
- dstack/plugins/__init__.py +8 -0
- dstack/plugins/_base.py +72 -0
- dstack/plugins/_models.py +8 -0
- dstack/plugins/_utils.py +19 -0
- dstack/plugins/builtin/__init__.py +0 -0
- dstack/plugins/builtin/rest_plugin/__init__.py +18 -0
- dstack/plugins/builtin/rest_plugin/_models.py +48 -0
- dstack/plugins/builtin/rest_plugin/_plugin.py +147 -0
- dstack/version.py +3 -1
- dstack-0.20.7.dist-info/METADATA +519 -0
- dstack-0.20.7.dist-info/RECORD +720 -0
- {dstack-0.0.9.dist-info → dstack-0.20.7.dist-info}/WHEEL +1 -2
- dstack-0.20.7.dist-info/entry_points.txt +2 -0
- dstack-0.20.7.dist-info/licenses/LICENSE.md +353 -0
- dstack/aws/__init__.py +0 -180
- dstack/aws/artifacts.py +0 -111
- dstack/aws/config.py +0 -40
- dstack/aws/jobs.py +0 -245
- dstack/aws/logs.py +0 -186
- dstack/aws/repos.py +0 -137
- dstack/aws/run_names.py +0 -17
- dstack/aws/runners.py +0 -693
- dstack/aws/runs.py +0 -79
- dstack/aws/secrets.py +0 -99
- dstack/aws/tags.py +0 -138
- dstack/backend.py +0 -299
- dstack/cli/app.py +0 -41
- dstack/cli/artifacts.py +0 -87
- dstack/cli/common.py +0 -57
- dstack/cli/config.py +0 -194
- dstack/cli/dashboard.py +0 -26
- dstack/cli/delete.py +0 -49
- dstack/cli/init.py +0 -33
- dstack/cli/logs.py +0 -87
- dstack/cli/main.py +0 -81
- dstack/cli/restart.py +0 -43
- dstack/cli/run.py +0 -223
- dstack/cli/schema.py +0 -46
- dstack/cli/secrets.py +0 -97
- dstack/cli/status.py +0 -140
- dstack/cli/stop.py +0 -53
- dstack/cli/tags.py +0 -100
- dstack/config.py +0 -80
- dstack/dashboard/artifacts.py +0 -26
- dstack/dashboard/logs.py +0 -73
- dstack/dashboard/main.py +0 -45
- dstack/dashboard/repos.py +0 -41
- dstack/dashboard/runs.py +0 -140
- dstack/dashboard/secrets.py +0 -53
- dstack/dashboard/statics/4d6a4e032505c1efd23c.png +0 -0
- dstack/dashboard/statics/7e018c3e5566d7c349a8.png +0 -0
- dstack/dashboard/statics/assets/android-chrome-144x144.png +0 -0
- dstack/dashboard/statics/assets/android-chrome-192x192.png +0 -0
- dstack/dashboard/statics/assets/android-chrome-256x256.png +0 -0
- dstack/dashboard/statics/assets/android-chrome-36x36.png +0 -0
- dstack/dashboard/statics/assets/android-chrome-384x384.png +0 -0
- dstack/dashboard/statics/assets/android-chrome-48x48.png +0 -0
- dstack/dashboard/statics/assets/android-chrome-512x512.png +0 -0
- dstack/dashboard/statics/assets/android-chrome-72x72.png +0 -0
- dstack/dashboard/statics/assets/android-chrome-96x96.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-icon-1024x1024.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-icon-114x114.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-icon-120x120.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-icon-144x144.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-icon-152x152.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-icon-167x167.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-icon-180x180.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-icon-57x57.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-icon-60x60.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-icon-72x72.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-icon-76x76.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-icon-precomposed.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-icon.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-1125x2436.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-1136x640.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-1242x2208.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-1242x2688.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-1334x750.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-1536x2048.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-1620x2160.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-1668x2224.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-1668x2388.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-1792x828.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-2048x1536.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-2048x2732.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-2160x1620.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-2208x1242.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-2224x1668.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-2388x1668.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-2436x1125.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-2688x1242.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-2732x2048.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-640x1136.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-750x1334.png +0 -0
- dstack/dashboard/statics/assets/apple-touch-startup-image-828x1792.png +0 -0
- dstack/dashboard/statics/assets/browserconfig.xml +0 -15
- dstack/dashboard/statics/assets/coast-228x228.png +0 -0
- dstack/dashboard/statics/assets/favicon-16x16.png +0 -0
- dstack/dashboard/statics/assets/favicon-32x32.png +0 -0
- dstack/dashboard/statics/assets/favicon-48x48.png +0 -0
- dstack/dashboard/statics/assets/favicon.ico +0 -0
- dstack/dashboard/statics/assets/firefox_app_128x128.png +0 -0
- dstack/dashboard/statics/assets/firefox_app_512x512.png +0 -0
- dstack/dashboard/statics/assets/firefox_app_60x60.png +0 -0
- dstack/dashboard/statics/assets/manifest.webapp +0 -14
- dstack/dashboard/statics/assets/mstile-144x144.png +0 -0
- dstack/dashboard/statics/assets/mstile-150x150.png +0 -0
- dstack/dashboard/statics/assets/mstile-310x150.png +0 -0
- dstack/dashboard/statics/assets/mstile-310x310.png +0 -0
- dstack/dashboard/statics/assets/mstile-70x70.png +0 -0
- dstack/dashboard/statics/assets/yandex-browser-50x50.png +0 -0
- dstack/dashboard/statics/d0f71e48806e25d72553.png +0 -0
- dstack/dashboard/statics/index.html +0 -7
- dstack/dashboard/statics/main-1d87e34eb0454da8ebb4.js +0 -3
- dstack/dashboard/statics/main-1d87e34eb0454da8ebb4.js.LICENSE.txt +0 -102
- dstack/dashboard/statics/main-1d87e34eb0454da8ebb4.js.map +0 -1
- dstack/dashboard/statics/main.css +0 -5058
- dstack/dashboard/statics/splash_thumbnail.png +0 -0
- dstack/dashboard/statics/static/media/check.3f68ffc787a15c0476793a6d18ecb71a.svg +0 -3
- dstack/dashboard/statics/static/media/chevron-down.bfd8f22c4a5db4d443e76bca3b02f334.svg +0 -3
- dstack/dashboard/statics/static/media/chevron-up.bade0c5d82d741cead615813264140c9.svg +0 -3
- dstack/dashboard/statics/static/media/clock.583b744f29b9d143718a55e7c35fe38e.svg +0 -3
- dstack/dashboard/statics/static/media/close.a8bb9e47361b03a3b5084dad676ba1da.svg +0 -3
- dstack/dashboard/statics/static/media/content-copy.73f5f2a175094757758e315243a4111e.svg +0 -3
- dstack/dashboard/statics/static/media/delete-outline.6a8abf4e4f9cb777781967efd56efe9b.svg +0 -3
- dstack/dashboard/statics/static/media/dots-vertical.82fc618192e0c7dc4d615ff93269246a.svg +0 -3
- dstack/dashboard/statics/static/media/earth.1ad57c7f59f4be5c8bb2fa00439c3149.svg +0 -3
- dstack/dashboard/statics/static/media/email.320bc3af24a5f1bb41ebd85f66a5dd70.svg +0 -3
- dstack/dashboard/statics/static/media/external-link.99b88e699c15afb820a1779d9a2261ed.svg +0 -3
- dstack/dashboard/statics/static/media/eye-off-outline.5b4afb7ad624a44dd307518ff93d1faa.svg +0 -3
- dstack/dashboard/statics/static/media/eye-outline.ca41708feaaed1edb15c5fff021fbafe.svg +0 -3
- dstack/dashboard/statics/static/media/file-download-outline.3634b41923ba79b297ff294ef898661c.svg +0 -3
- dstack/dashboard/statics/static/media/folder-outline.33378387af61821dd1207e4b2d061a07.svg +0 -3
- dstack/dashboard/statics/static/media/github-circle.1bb85d171c31a3c2eebad07319377171.svg +0 -3
- dstack/dashboard/statics/static/media/infinity.915f92939afc0a37f94adba211ceb172.svg +0 -3
- dstack/dashboard/statics/static/media/layers.b4b02cea267a617d7aa44c2719250c89.svg +0 -3
- dstack/dashboard/statics/static/media/linkedin.1c52fae553eee54397f0e63a79455a5e.svg +0 -3
- dstack/dashboard/statics/static/media/loading.e466be7b2c1f0ac9e7e51ca929d0e37d.svg +0 -3
- dstack/dashboard/statics/static/media/lock.4a4c7768d0fa60c716609ddc483470ef.svg +0 -3
- dstack/dashboard/statics/static/media/magnify.0c803314d039d21f3cb1504ccd1437a4.svg +0 -3
- dstack/dashboard/statics/static/media/mark.3f68ffc787a15c0476793a6d18ecb71a.svg +0 -3
- dstack/dashboard/statics/static/media/menu-close.3ee84714181017c6ff837830297c8437.svg +0 -3
- dstack/dashboard/statics/static/media/menu.922f81e0972fbcbb5adcd8def20c86a3.svg +0 -3
- dstack/dashboard/statics/static/media/pencil.f706a3b9dcbff4959a91bf72e1e6324f.svg +0 -3
- dstack/dashboard/statics/static/media/refresh.a80edb948e98b322cd73b67814a57a48.svg +0 -3
- dstack/dashboard/statics/static/media/shape-plus.63b093c7f4b44c3def774f30fcfbceca.svg +0 -3
- dstack/dashboard/statics/static/media/slack.ec2fca99c6b944950ac65404ddd26880.svg +0 -4
- dstack/dashboard/statics/static/media/small-logo.b9cc8d09f646a553e65fa336dafd8b10.svg +0 -116
- dstack/dashboard/statics/static/media/source-branch.b8d22cfc42a7bed81f0fc08130818e85.svg +0 -3
- dstack/dashboard/statics/static/media/source-commit.be2bb53c081b9b6836adffccc0b8d3e6.svg +0 -3
- dstack/dashboard/statics/static/media/stop.11488ff1437ad929476be8924a3b7075.svg +0 -3
- dstack/dashboard/statics/static/media/tag-minus.15680a815b0b8d027e973c84832c05e6.svg +0 -3
- dstack/dashboard/statics/static/media/tag-outline.19b0bf86a8afd7d6d9c716e9a91d94ca.svg +0 -3
- dstack/dashboard/statics/static/media/twitter.4af18861c84a2f3044c7546b55d5739c.svg +0 -3
- dstack/dashboard/tags.py +0 -119
- dstack/jobs.py +0 -255
- dstack/providers/__init__.py +0 -316
- dstack/providers/_python/main.py +0 -88
- dstack/providers/_tensorboard/main.py +0 -93
- dstack/providers/_torchrun/main.py +0 -121
- dstack/providers/bash/main.py +0 -90
- dstack/providers/code/main.py +0 -95
- dstack/providers/docker/main.py +0 -79
- dstack/providers/lab/main.py +0 -95
- dstack/providers/notebook/main.py +0 -90
- dstack/random_name.py +0 -29
- dstack/repo.py +0 -135
- dstack/runners.py +0 -35
- dstack/util.py +0 -15
- dstack-0.0.9.dist-info/METADATA +0 -176
- dstack-0.0.9.dist-info/RECORD +0 -179
- dstack-0.0.9.dist-info/entry_points.txt +0 -3
- dstack-0.0.9.dist-info/top_level.txt +0 -2
- tests/test_config.py +0 -70
- /dstack/{cli → _internal}/__init__.py +0 -0
- /dstack/{dashboard → _internal/cli}/__init__.py +0 -0
- /dstack/{providers/_python → _internal/cli/models}/__init__.py +0 -0
- /dstack/{providers/_tensorboard → _internal/cli/services}/__init__.py +0 -0
- /dstack/{providers/_torchrun → _internal/cli/utils}/__init__.py +0 -0
- /dstack/{providers/bash → _internal/core}/__init__.py +0 -0
- /dstack/{providers/code → _internal/core/backends}/__init__.py +0 -0
- /dstack/{providers/docker → _internal/core/backends/aws}/__init__.py +0 -0
- /dstack/{providers/lab → _internal/core/backends/azure}/__init__.py +0 -0
- /dstack/{providers/notebook → _internal/core/backends/base}/__init__.py +0 -0
- {tests → dstack/_internal/core/backends/cloudrift}/__init__.py +0 -0
- /dstack/{dashboard → _internal/server}/statics/assets/yandex-browser-manifest.json +0 -0
- /dstack/{dashboard → _internal/server}/statics/robots.txt +0 -0
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import datetime
|
|
3
|
+
from datetime import timedelta
|
|
4
|
+
|
|
5
|
+
from sqlalchemy import select
|
|
6
|
+
from sqlalchemy.ext.asyncio import AsyncSession
|
|
7
|
+
from sqlalchemy.orm import joinedload, load_only
|
|
8
|
+
|
|
9
|
+
from dstack._internal.core.backends.base.compute import ComputeWithGroupProvisioningSupport
|
|
10
|
+
from dstack._internal.core.errors import BackendError
|
|
11
|
+
from dstack._internal.core.models.compute_groups import ComputeGroupStatus
|
|
12
|
+
from dstack._internal.core.models.instances import InstanceStatus
|
|
13
|
+
from dstack._internal.server.db import get_db, get_session_ctx
|
|
14
|
+
from dstack._internal.server.models import (
|
|
15
|
+
ComputeGroupModel,
|
|
16
|
+
ProjectModel,
|
|
17
|
+
)
|
|
18
|
+
from dstack._internal.server.services import backends as backends_services
|
|
19
|
+
from dstack._internal.server.services.compute_groups import compute_group_model_to_compute_group
|
|
20
|
+
from dstack._internal.server.services.instances import switch_instance_status
|
|
21
|
+
from dstack._internal.server.services.locking import get_locker
|
|
22
|
+
from dstack._internal.server.utils import sentry_utils
|
|
23
|
+
from dstack._internal.utils.common import get_current_datetime, run_async
|
|
24
|
+
from dstack._internal.utils.logging import get_logger
|
|
25
|
+
|
|
26
|
+
logger = get_logger(__name__)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
# A compute group is picked for processing only if its `last_processed_at`
# is older than this interval.
MIN_PROCESSING_INTERVAL = timedelta(seconds=30)

# Delay added to the last failed termination attempt to get the next attempt time.
TERMINATION_RETRY_TIMEOUT = timedelta(seconds=60)
# Window, counted from the first failed termination attempt, within which
# further attempts are still scheduled.
TERMINATION_RETRY_MAX_DURATION = timedelta(minutes=15)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
async def process_compute_groups(batch_size: int = 1):
    """Run `batch_size` compute group processing passes concurrently.

    Each pass is an independent `_process_next_compute_group()` call;
    all of them are awaited together.
    """
    await asyncio.gather(*(_process_next_compute_group() for _ in range(batch_size)))
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
@sentry_utils.instrument_background_task
async def _process_next_compute_group():
    """Pick one compute group that is due for processing and process it.

    The candidate is the non-deleted compute group with the oldest
    `last_processed_at` that is not in the lockset and was last processed
    more than `MIN_PROCESSING_INTERVAL` ago. Returns without doing anything
    when no compute group qualifies.
    """
    locker = get_locker(get_db().dialect_name)
    lock, lockset = locker.get_lockset(ComputeGroupModel.__tablename__)
    async with get_session_ctx() as session:
        # The lock is held only while a candidate is selected and registered
        # in the lockset; processing itself runs after the lock is released.
        async with lock:
            processed_before = get_current_datetime() - MIN_PROCESSING_INTERVAL
            candidate_stmt = (
                select(ComputeGroupModel)
                .where(
                    ComputeGroupModel.deleted == False,
                    ComputeGroupModel.id.not_in(lockset),
                    ComputeGroupModel.last_processed_at < processed_before,
                )
                # Only the id is needed here; the full row is refetched later.
                .options(load_only(ComputeGroupModel.id))
                .order_by(ComputeGroupModel.last_processed_at.asc())
                .limit(1)
                .with_for_update(skip_locked=True, key_share=True)
            )
            compute_group_model = (await session.execute(candidate_stmt)).scalar()
            if compute_group_model is None:
                return
            compute_group_model_id = compute_group_model.id
            lockset.add(compute_group_model_id)
        try:
            await _process_compute_group(
                session=session,
                compute_group_model=compute_group_model,
            )
        finally:
            # Always release the id so the compute group can be picked again.
            lockset.difference_update([compute_group_model_id])
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
async def _process_compute_group(session: AsyncSession, compute_group_model: ComputeGroupModel):
    """Process a single compute group and record the processing time.

    Terminates the compute group when none of its instances has a status
    other than `TERMINATING`, then updates `last_processed_at` and commits.
    """
    # Refetch to load related attributes (instances, project and its backends).
    refetch_stmt = (
        select(ComputeGroupModel)
        .where(ComputeGroupModel.id == compute_group_model.id)
        .options(
            joinedload(ComputeGroupModel.instances),
            joinedload(ComputeGroupModel.project).joinedload(ProjectModel.backends),
        )
        .execution_options(populate_existing=True)
    )
    refetched = await session.execute(refetch_stmt)
    compute_group_model = refetched.unique().scalar_one()
    has_non_terminating_instance = any(
        instance.status != InstanceStatus.TERMINATING
        for instance in compute_group_model.instances
    )
    if not has_non_terminating_instance:
        await _terminate_compute_group(session, compute_group_model)
    compute_group_model.last_processed_at = get_current_datetime()
    await session.commit()
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
async def _terminate_compute_group(
    session: AsyncSession, compute_group_model: ComputeGroupModel
) -> None:
    """Terminate a compute group in its backend and mark it and its instances deleted.

    Does nothing while a previously scheduled retry time has not been reached.
    When the backend call fails and the next retry still falls before the
    termination deadline, the attempt is recorded and the function returns so
    a later pass can retry. When the backend is unavailable, or the retries
    are exhausted, an error is logged and the models are still marked as
    terminated/deleted.
    """
    retry_not_due_yet = (
        compute_group_model.last_termination_retry_at is not None
        and _next_termination_retry_at(compute_group_model) > get_current_datetime()
    )
    if retry_not_due_yet:
        return

    compute_group = compute_group_model_to_compute_group(compute_group_model)
    provisioning_data = compute_group.provisioning_data
    backend = await backends_services.get_project_backend_by_type(
        project=compute_group_model.project,
        backend_type=provisioning_data.backend,
    )
    if backend is not None:
        logger.debug("Terminating compute group %s", compute_group.name)
        compute = backend.compute()
        assert isinstance(compute, ComputeWithGroupProvisioningSupport)
        try:
            await run_async(
                compute.terminate_compute_group,
                compute_group,
            )
        except Exception as exc:
            # Tracebacks are logged for everything except BackendError.
            log_traceback = not isinstance(exc, BackendError)
            if compute_group_model.first_termination_retry_at is None:
                compute_group_model.first_termination_retry_at = get_current_datetime()
            compute_group_model.last_termination_retry_at = get_current_datetime()
            next_retry_at = _next_termination_retry_at(compute_group_model)
            if next_retry_at < _get_termination_deadline(compute_group_model):
                logger.warning(
                    "Failed to terminate compute group %s. Will retry. Error: %r",
                    compute_group.name,
                    exc,
                    exc_info=log_traceback,
                )
                return
            # No retry fits before the deadline: give up and fall through to
            # mark the compute group as terminated.
            logger.error(
                "Failed all attempts to terminate compute group %s."
                " Please terminate it manually to avoid unexpected charges."
                " Error: %r",
                compute_group.name,
                exc,
                exc_info=log_traceback,
            )
    else:
        logger.error(
            "Failed to terminate compute group %s. Backend %s not available.",
            compute_group.name,
            provisioning_data.backend,
        )

    compute_group_model.deleted = True
    compute_group_model.deleted_at = get_current_datetime()
    compute_group_model.status = ComputeGroupStatus.TERMINATED
    # Terminating instances belonging to a compute group are locked implicitly
    # by locking the compute group.
    for instance_model in compute_group_model.instances:
        instance_model.deleted = True
        instance_model.deleted_at = get_current_datetime()
        instance_model.finished_at = get_current_datetime()
        switch_instance_status(session, instance_model, InstanceStatus.TERMINATED)
    logger.info(
        "Terminated compute group %s",
        compute_group.name,
    )
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def _next_termination_retry_at(compute_group_model: ComputeGroupModel) -> datetime.datetime:
    """Return the time of the next termination attempt.

    That is the last attempt time plus `TERMINATION_RETRY_TIMEOUT`.
    `last_termination_retry_at` must be set.
    """
    last_retry_at = compute_group_model.last_termination_retry_at
    assert last_retry_at is not None
    return last_retry_at + TERMINATION_RETRY_TIMEOUT
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def _get_termination_deadline(compute_group_model: ComputeGroupModel) -> datetime.datetime:
    """Return the time after which termination is no longer retried.

    That is the first attempt time plus `TERMINATION_RETRY_MAX_DURATION`.
    `first_termination_retry_at` must be set.
    """
    first_retry_at = compute_group_model.first_termination_retry_at
    assert first_retry_at is not None
    return first_retry_at + TERMINATION_RETRY_MAX_DURATION
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
from datetime import timedelta
|
|
2
|
+
|
|
3
|
+
from sqlalchemy import delete
|
|
4
|
+
|
|
5
|
+
from dstack._internal.server import settings
|
|
6
|
+
from dstack._internal.server.db import get_session_ctx
|
|
7
|
+
from dstack._internal.server.models import EventModel
|
|
8
|
+
from dstack._internal.server.utils import sentry_utils
|
|
9
|
+
from dstack._internal.utils.common import get_current_datetime
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@sentry_utils.instrument_background_task
async def delete_events():
    """
    Deletes all events recorded earlier than `settings.SERVER_EVENTS_TTL_SECONDS` ago.
    """
    events_ttl = timedelta(seconds=settings.SERVER_EVENTS_TTL_SECONDS)
    expired_before = get_current_datetime() - events_ttl
    async with get_session_ctx() as session:
        await session.execute(
            delete(EventModel).where(EventModel.recorded_at < expired_before)
        )
|
|
@@ -0,0 +1,289 @@
|
|
|
1
|
+
from collections import defaultdict
|
|
2
|
+
from datetime import timedelta
|
|
3
|
+
from typing import List
|
|
4
|
+
from uuid import UUID
|
|
5
|
+
|
|
6
|
+
from sqlalchemy import select, update
|
|
7
|
+
from sqlalchemy.ext.asyncio import AsyncSession
|
|
8
|
+
from sqlalchemy.orm import joinedload, load_only, selectinload, with_loader_criteria
|
|
9
|
+
|
|
10
|
+
from dstack._internal.core.models.fleets import FleetSpec, FleetStatus
|
|
11
|
+
from dstack._internal.core.models.instances import InstanceStatus, InstanceTerminationReason
|
|
12
|
+
from dstack._internal.server.db import get_db, get_session_ctx
|
|
13
|
+
from dstack._internal.server.models import (
|
|
14
|
+
FleetModel,
|
|
15
|
+
InstanceModel,
|
|
16
|
+
JobModel,
|
|
17
|
+
PlacementGroupModel,
|
|
18
|
+
RunModel,
|
|
19
|
+
)
|
|
20
|
+
from dstack._internal.server.services import events
|
|
21
|
+
from dstack._internal.server.services.fleets import (
|
|
22
|
+
create_fleet_instance_model,
|
|
23
|
+
get_fleet_spec,
|
|
24
|
+
get_next_instance_num,
|
|
25
|
+
is_fleet_empty,
|
|
26
|
+
is_fleet_in_use,
|
|
27
|
+
switch_fleet_status,
|
|
28
|
+
)
|
|
29
|
+
from dstack._internal.server.services.instances import switch_instance_status
|
|
30
|
+
from dstack._internal.server.services.locking import get_locker
|
|
31
|
+
from dstack._internal.server.utils import sentry_utils
|
|
32
|
+
from dstack._internal.utils.common import get_current_datetime
|
|
33
|
+
from dstack._internal.utils.logging import get_logger
|
|
34
|
+
|
|
35
|
+
# Module-level logger named after this module.
logger = get_logger(__name__)


# Upper bound on the number of fleets selected by a single `process_fleets` call.
BATCH_SIZE = 10
# A fleet is selected for processing only if its `last_processed_at`
# is older than this interval.
MIN_PROCESSING_INTERVAL = timedelta(seconds=30)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
@sentry_utils.instrument_background_task
async def process_fleets():
    """
    Selects a batch of fleets due for processing, locks them together with their
    instances, and passes the fleets whose instances were all locked to `_process_fleets`.

    A fleet is selected if it is not deleted, its id is not in the fleet lockset, and
    its `last_processed_at` is older than `MIN_PROCESSING_INTERVAL`. At most `BATCH_SIZE`
    fleets are selected per call, least recently processed first.
    """
    fleet_lock, fleet_lockset = get_locker(get_db().dialect_name).get_lockset(
        FleetModel.__tablename__
    )
    instance_lock, instance_lockset = get_locker(get_db().dialect_name).get_lockset(
        InstanceModel.__tablename__
    )
    async with get_session_ctx() as session:
        # Both locks are held while rows are selected and their ids are added to the locksets.
        async with fleet_lock, instance_lock:
            res = await session.execute(
                select(FleetModel)
                .where(
                    FleetModel.deleted == False,
                    FleetModel.id.not_in(fleet_lockset),
                    FleetModel.last_processed_at
                    < get_current_datetime() - MIN_PROCESSING_INTERVAL,
                )
                .options(
                    # Only the id (and the name, for logging) are needed at this stage;
                    # `_process_fleets` refetches the related attributes.
                    load_only(FleetModel.id, FleetModel.name),
                    selectinload(FleetModel.instances).load_only(InstanceModel.id),
                    # Leave deleted instances out of the loaded `instances`.
                    with_loader_criteria(
                        InstanceModel, InstanceModel.deleted == False, include_aliases=True
                    ),
                )
                .order_by(FleetModel.last_processed_at.asc())
                .limit(BATCH_SIZE)
                # Rows locked by another transaction are skipped instead of waited for.
                .with_for_update(skip_locked=True, key_share=True)
            )
            fleet_models = list(res.scalars().unique().all())
            fleet_ids = [fm.id for fm in fleet_models]
            # Lock the non-deleted instances of the selected fleets that are not
            # already in the instance lockset.
            res = await session.execute(
                select(InstanceModel)
                .where(
                    InstanceModel.id.not_in(instance_lockset),
                    InstanceModel.fleet_id.in_(fleet_ids),
                    InstanceModel.deleted == False,
                )
                .options(load_only(InstanceModel.id, InstanceModel.fleet_id))
                .order_by(InstanceModel.id)
                .with_for_update(skip_locked=True, key_share=True)
            )
            instance_models = list(res.scalars().all())
            fleet_id_to_locked_instances = defaultdict(list)
            for instance_model in instance_models:
                fleet_id_to_locked_instances[instance_model.fleet_id].append(instance_model)
            # Process only fleets with all instances locked.
            # Other fleets won't be processed but will still be locked to avoid new transaction.
            # This should not be problematic as long as process_fleets is quick.
            fleet_models_to_process = []
            for fleet_model in fleet_models:
                # A count mismatch means some instances of the fleet were not returned
                # by the locking query above.
                if len(fleet_model.instances) == len(fleet_id_to_locked_instances[fleet_model.id]):
                    fleet_models_to_process.append(fleet_model)
                else:
                    logger.debug(
                        "Fleet %s processing will be skipped: some instance were not locked",
                        fleet_model.name,
                    )
            # Every selected fleet id goes into the lockset, including skipped fleets.
            for fleet_id in fleet_ids:
                fleet_lockset.add(fleet_id)
            instance_ids = [im.id for im in instance_models]
            for instance_id in instance_ids:
                instance_lockset.add(instance_id)
        try:
            await _process_fleets(session=session, fleet_models=fleet_models_to_process)
        finally:
            # Always remove the ids from the locksets, even if processing raised.
            fleet_lockset.difference_update(fleet_ids)
            instance_lockset.difference_update(instance_ids)
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
async def _process_fleets(session: AsyncSession, fleet_models: List[FleetModel]):
    """
    Consolidates and auto-deletes the given fleets, then commits the session.

    For each fleet, `_consolidate_fleet_state_with_spec` and `_autodelete_fleet` are
    applied in that order and `last_processed_at` is updated. Placement groups of the
    fleets that got deleted are flagged before the commit.
    """
    # Refetch to load related attributes.
    refetch_stmt = (
        select(FleetModel)
        .where(FleetModel.id.in_([fm.id for fm in fleet_models]))
        .options(
            joinedload(FleetModel.instances).joinedload(InstanceModel.jobs).load_only(JobModel.id),
            with_loader_criteria(
                InstanceModel, InstanceModel.deleted == False, include_aliases=True
            ),
        )
        .options(joinedload(FleetModel.project))
        .options(joinedload(FleetModel.runs).load_only(RunModel.status))
        .execution_options(populate_existing=True)
    )
    refetched = await session.execute(refetch_stmt)
    loaded_fleets = list(refetched.unique().scalars().all())

    # TODO: Drop fleets auto-deletion after dropping fleets auto-creation.
    removed_fleet_ids = []
    for fleet in loaded_fleets:
        _consolidate_fleet_state_with_spec(session, fleet)
        if _autodelete_fleet(session, fleet):
            removed_fleet_ids.append(fleet.id)
        fleet.last_processed_at = get_current_datetime()
    await _update_deleted_fleets_placement_groups(session, removed_fleet_ids)
    await session.commit()
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def _consolidate_fleet_state_with_spec(session: AsyncSession, fleet_model: FleetModel):
    """
    Brings the fleet's instances in line with its spec when a consolidation
    attempt is due, and records the attempt on the fleet.

    Does nothing for terminating fleets, for fleets whose spec has no `nodes`
    or is `autocreated`, and for fleets whose retry delay has not elapsed yet.
    """
    if fleet_model.status == FleetStatus.TERMINATING:
        return
    spec = get_fleet_spec(fleet_model)
    # Only explicitly created cloud fleets are consolidated.
    is_explicit_cloud_fleet = spec.configuration.nodes is not None and not spec.autocreated
    if not is_explicit_cloud_fleet:
        return
    if not _is_fleet_ready_for_consolidation(fleet_model):
        return
    instances_changed = _maintain_fleet_nodes_in_min_max_range(session, fleet_model, spec)
    # When nothing changed, the fleet is already consolidated or consolidation is
    # in progress. We reset consolidation_attempt in both cases for simplicity.
    # The second case does not need reset but is ok to do since
    # it means consolidation is longer than delay, so it won't happen too often.
    # TODO: Reset consolidation_attempt on fleet in-place update.
    fleet_model.consolidation_attempt = (
        fleet_model.consolidation_attempt + 1 if instances_changed else 0
    )
    fleet_model.last_consolidated_at = get_current_datetime()
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def _is_fleet_ready_for_consolidation(fleet_model: FleetModel) -> bool:
    """
    Returns `True` if the retry delay for the fleet's current `consolidation_attempt`
    has elapsed since the last consolidation.

    Falls back to `last_processed_at` when `last_consolidated_at` is not set.
    """
    reference_time = fleet_model.last_consolidated_at or fleet_model.last_processed_at
    elapsed = get_current_datetime() - reference_time
    return elapsed >= _get_consolidation_retry_delay(fleet_model.consolidation_attempt)
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
# We use exponentially increasing consolidation retry delays so that
# consolidation does not happen too often. In particular, this prevents
# retrying instance provisioning constantly in case of no offers.
# TODO: Adjust delays.
_CONSOLIDATION_RETRY_DELAYS = [
    timedelta(seconds=30),
    timedelta(minutes=1),
    timedelta(minutes=2),
    timedelta(minutes=5),
    timedelta(minutes=10),
]


def _get_consolidation_retry_delay(consolidation_attempt: int) -> timedelta:
    """
    Returns the delay to wait before the given consolidation attempt.

    Attempts past the end of `_CONSOLIDATION_RETRY_DELAYS` get the last (longest) delay.
    Negative attempts get the first (shortest) delay instead of being interpreted
    as indices counted from the end of the list.
    """
    last_index = len(_CONSOLIDATION_RETRY_DELAYS) - 1
    # Clamp on both sides so that the index is always within the list.
    clamped_attempt = max(0, min(consolidation_attempt, last_index))
    return _CONSOLIDATION_RETRY_DELAYS[clamped_attempt]
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
def _maintain_fleet_nodes_in_min_max_range(
    session: AsyncSession,
    fleet_model: FleetModel,
    fleet_spec: FleetSpec,
) -> bool:
    """
    Keeps the number of fleet instances within `nodes.min` and `nodes.max`.

    Terminated instances are marked deleted first. Then missing instances are created
    if the fleet is below `nodes.min`, or idle instances are switched to terminating
    if the fleet is above `nodes.max`.
    Returns `False` if the instance count is already within range and `True` otherwise.
    """
    nodes = fleet_spec.configuration.nodes
    assert nodes is not None

    # Delete terminated but not deleted instances since
    # they are going to be replaced with new pending instances.
    for candidate in fleet_model.instances:
        if candidate.deleted or candidate.status != InstanceStatus.TERMINATED:
            continue
        candidate.deleted = True
        candidate.deleted_at = get_current_datetime()

    live_instances = [inst for inst in fleet_model.instances if not inst.deleted]
    live_count = len(live_instances)

    if live_count < nodes.min:
        shortfall = nodes.min - live_count
        for _ in range(shortfall):
            taken_nums = {inst.instance_num for inst in live_instances}
            new_instance = create_fleet_instance_model(
                session=session,
                project=fleet_model.project,
                # TODO: Store fleet.user and pass it instead of the project owner.
                username=fleet_model.project.owner.name,
                spec=fleet_spec,
                instance_num=get_next_instance_num(taken_nums),
            )
            message = (
                "Instance created to meet target fleet node count."
                f" Status: {new_instance.status.upper()}"
            )
            events.emit(
                session,
                message,
                actor=events.SystemActor(),
                targets=[events.Target.from_model(new_instance)],
            )
            live_instances.append(new_instance)
            fleet_model.instances.append(new_instance)
        logger.info("Added %s instances to fleet %s", shortfall, fleet_model.name)
        return True

    if nodes.max is None or live_count <= nodes.max:
        return False

    # Fleet has more instances than allowed by nodes.max.
    # This is possible due to race conditions (e.g. provisioning jobs in a fleet concurrently)
    # or if nodes.max is updated.
    surplus = live_count - nodes.max
    for inst in fleet_model.instances:
        if surplus == 0:
            break
        # Only idle instances are terminated.
        if inst.status != InstanceStatus.IDLE:
            continue
        inst.termination_reason = InstanceTerminationReason.MAX_INSTANCES_LIMIT
        inst.termination_reason_message = "Fleet has too many instances"
        switch_instance_status(session, inst, InstanceStatus.TERMINATING)
        surplus -= 1
    return True
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
def _autodelete_fleet(session: AsyncSession, fleet_model: FleetModel) -> bool:
    """
    Marks the fleet as terminated and deleted if it should be cleaned up automatically.

    A fleet is deleted if its project is deleted, or if it is unused and empty.
    An unused empty fleet is kept if it is not terminating and its spec allows 0 nodes.
    Returns `True` if the fleet was deleted and `False` otherwise.
    """
    if fleet_model.project.deleted:
        # It used to be possible to delete project with active resources:
        # https://github.com/dstackai/dstack/issues/3077
        switch_fleet_status(session, fleet_model, FleetStatus.TERMINATED)
        fleet_model.deleted = True
        logger.info("Fleet %s deleted due to deleted project", fleet_model.name)
        return True

    unused_and_empty = not is_fleet_in_use(fleet_model) and is_fleet_empty(fleet_model)
    if not unused_and_empty:
        return False

    nodes = get_fleet_spec(fleet_model).configuration.nodes
    allows_zero_nodes = nodes is not None and nodes.min == 0
    if fleet_model.status != FleetStatus.TERMINATING and allows_zero_nodes:
        # Empty fleets that allow 0 nodes should not be auto-deleted
        return False

    logger.info("Automatic cleanup of an empty fleet %s", fleet_model.name)
    switch_fleet_status(session, fleet_model, FleetStatus.TERMINATED)
    fleet_model.deleted = True
    logger.info("Fleet %s deleted", fleet_model.name)
    return True
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
async def _update_deleted_fleets_placement_groups(session: AsyncSession, fleets_ids: list[UUID]):
    """
    Sets `fleet_deleted` on all placement groups that belong to the given fleets.

    Issues no statement when `fleets_ids` is empty.
    """
    if not fleets_ids:
        return
    mark_fleet_deleted = (
        update(PlacementGroupModel)
        .where(PlacementGroupModel.fleet_id.in_(fleets_ids))
        .values(fleet_deleted=True)
    )
    await session.execute(mark_fleet_deleted)
|
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
|
|
3
|
+
from sqlalchemy import select
|
|
4
|
+
from sqlalchemy.ext.asyncio import AsyncSession
|
|
5
|
+
from sqlalchemy.orm import joinedload, lazyload
|
|
6
|
+
|
|
7
|
+
from dstack._internal.core.errors import BackendError, BackendNotAvailable, SSHError
|
|
8
|
+
from dstack._internal.core.models.gateways import GatewayStatus
|
|
9
|
+
from dstack._internal.server.db import get_db, get_session_ctx
|
|
10
|
+
from dstack._internal.server.models import GatewayComputeModel, GatewayModel, ProjectModel
|
|
11
|
+
from dstack._internal.server.services import backends as backends_services
|
|
12
|
+
from dstack._internal.server.services import gateways as gateways_services
|
|
13
|
+
from dstack._internal.server.services.gateways import (
|
|
14
|
+
GatewayConnection,
|
|
15
|
+
create_gateway_compute,
|
|
16
|
+
gateway_connections_pool,
|
|
17
|
+
switch_gateway_status,
|
|
18
|
+
)
|
|
19
|
+
from dstack._internal.server.services.locking import advisory_lock_ctx, get_locker
|
|
20
|
+
from dstack._internal.server.services.logging import fmt
|
|
21
|
+
from dstack._internal.server.utils import sentry_utils
|
|
22
|
+
from dstack._internal.utils.common import get_current_datetime
|
|
23
|
+
from dstack._internal.utils.logging import get_logger
|
|
24
|
+
|
|
25
|
+
# Module-level logger named after this module.
logger = get_logger(__name__)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
async def process_gateways_connections():
    """
    Drops pooled gateway connections that are no longer active and then
    checks the remaining ones.
    """
    # Remove stale connections first so that they are not checked below.
    await _remove_inactive_connections()
    await _process_active_connections()
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@sentry_utils.instrument_background_task
async def process_gateways():
    """
    Picks the least recently processed gateway in `SUBMITTED` or `PROVISIONING` status
    that is not already in the lockset and runs one processing step for it.

    Returns without doing anything if no such gateway is found.
    """
    lock, lockset = get_locker(get_db().dialect_name).get_lockset(GatewayModel.__tablename__)
    async with get_session_ctx() as session:
        # The lock is held only while a gateway is selected and its id is added to the lockset.
        async with lock:
            res = await session.execute(
                select(GatewayModel)
                .where(
                    GatewayModel.status.in_([GatewayStatus.SUBMITTED, GatewayStatus.PROVISIONING]),
                    GatewayModel.id.not_in(lockset),
                )
                .options(lazyload(GatewayModel.gateway_compute))
                .order_by(GatewayModel.last_processed_at.asc())
                .limit(1)
                # Rows locked by another transaction are skipped instead of waited for.
                .with_for_update(skip_locked=True, key_share=True)
            )
            gateway_model = res.scalar()
            if gateway_model is None:
                return
            lockset.add(gateway_model.id)
        # Keep the id in a local; presumably so that `finally` does not have to
        # touch the ORM object after the commit -- TODO confirm.
        gateway_model_id = gateway_model.id
        try:
            initial_status = gateway_model.status
            if initial_status == GatewayStatus.SUBMITTED:
                await _process_submitted_gateway(session=session, gateway_model=gateway_model)
            elif initial_status == GatewayStatus.PROVISIONING:
                await _process_provisioning_gateway(session=session, gateway_model=gateway_model)
            else:
                # Not reachable with the status filter in the query above.
                logger.error(
                    "%s: unexpected gateway status %r", fmt(gateway_model), initial_status.upper()
                )
            # The processing helpers only modify the model; the commit happens here.
            gateway_model.last_processed_at = get_current_datetime()
            await session.commit()
        finally:
            # Always remove the id from the lockset, even if processing raised.
            lockset.difference_update([gateway_model_id])
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
async def _remove_inactive_connections():
    """
    Removes from the connection pool every connection whose IP address does not
    belong to a gateway compute with `active` set.
    """
    active_ips_stmt = select(GatewayComputeModel.ip_address).where(
        GatewayComputeModel.active == True
    )
    async with get_session_ctx() as session:
        active_ips = set((await session.execute(active_ips_stmt)).scalars().all())
    for pooled_conn in await gateway_connections_pool.all():
        if pooled_conn.ip_address in active_ips:
            continue
        await gateway_connections_pool.remove(pooled_conn.ip_address)
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
async def _process_active_connections():
    """
    Runs `_process_connection` concurrently for every pooled gateway connection
    while holding the "gateway_tunnels" advisory lock.
    """
    pooled_connections = await gateway_connections_pool.all()
    # Two server processes on a single host cannot process
    # gateway connections and init gateway connections concurrently:
    # Race conditions cause conflicting tunnels being opened.
    async with get_session_ctx() as session:
        tunnels_lock = advisory_lock_ctx(
            bind=session,
            dialect_name=get_db().dialect_name,
            resource="gateway_tunnels",
        )
        async with tunnels_lock:
            pending_checks = [_process_connection(c) for c in pooled_connections]
            await asyncio.gather(*pending_checks)
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
async def _process_connection(conn: GatewayConnection):
    """
    Checks (or restarts) the gateway connection and, if that succeeds, collects its stats.

    An `SSHError` from the check is logged and the stats collection is skipped.
    """
    try:
        await conn.check_or_restart()
    except SSHError as ssh_error:
        logger.error("Connection to gateway %s failed: %s", conn.ip_address, ssh_error)
    else:
        await conn.try_collect_stats()
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
async def _process_submitted_gateway(session: AsyncSession, gateway_model: GatewayModel):
    """
    Creates gateway compute for a submitted gateway and switches it to `PROVISIONING`.

    The gateway is switched to `FAILED` with a `status_message` if the backend is not
    available or if creating the compute raises. The session is not committed here.
    """
    logger.info("%s: started gateway provisioning", fmt(gateway_model))
    # Refetch to load related attributes.
    refetch_stmt = (
        select(GatewayModel)
        .where(GatewayModel.id == gateway_model.id)
        .options(joinedload(GatewayModel.project).joinedload(ProjectModel.backends))
        .execution_options(populate_existing=True)
    )
    gateway = (await session.execute(refetch_stmt)).unique().scalar_one()
    configuration = gateways_services.get_gateway_configuration(gateway)

    try:
        backend_model, backend = (
            await backends_services.get_project_backend_with_model_by_type_or_error(
                project=gateway.project, backend_type=configuration.backend
            )
        )
    except BackendNotAvailable:
        gateway.status_message = "Backend not available"
        switch_gateway_status(session, gateway, GatewayStatus.FAILED)
        return

    try:
        gateway.gateway_compute = await create_gateway_compute(
            backend_compute=backend.compute(),
            project_name=gateway.project.name,
            configuration=configuration,
            backend_id=backend_model.id,
        )
        session.add(gateway)
        switch_gateway_status(session, gateway, GatewayStatus.PROVISIONING)
    except BackendError as backend_error:
        # Prefer the first error argument as the message when there is one.
        gateway.status_message = (
            str(backend_error.args[0])
            if len(backend_error.args) > 0
            else f"Backend error: {repr(backend_error)}"
        )
        switch_gateway_status(session, gateway, GatewayStatus.FAILED)
    except Exception as unexpected_error:
        logger.exception("%s: got exception when creating gateway compute", fmt(gateway))
        gateway.status_message = f"Unexpected error: {repr(unexpected_error)}"
        switch_gateway_status(session, gateway, GatewayStatus.FAILED)
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
async def _process_provisioning_gateway(
    session: AsyncSession, gateway_model: GatewayModel
) -> None:
    """
    Connects to a provisioning gateway, configures it, and switches it to `RUNNING`.

    The gateway is switched to `FAILED` with a `status_message` if the connection
    cannot be established or the configuration raises. The session is not committed here.
    """
    # Refetch to load related attributes.
    refetch_stmt = (
        select(GatewayModel)
        .where(GatewayModel.id == gateway_model.id)
        .execution_options(populate_existing=True)
    )
    gateway = (await session.execute(refetch_stmt)).unique().scalar_one()

    # Provisioning gateways must have compute.
    compute = gateway.gateway_compute
    assert compute is not None

    # FIXME: problems caused by blocking on connect_to_gateway_with_retry and configure_gateway:
    # - cannot delete the gateway before it is provisioned because the DB model is locked
    # - connection retry counter is reset on server restart
    # - only one server replica is processing the gateway
    # Easy to fix by doing only one connection/configuration attempt per processing iteration. The
    # main challenge is applying the same provisioning model to the dstack Sky gateway to avoid
    # maintaining a different model for Sky.
    connection = await gateways_services.connect_to_gateway_with_retry(compute)
    if connection is None:
        gateway.status_message = "Failed to connect to gateway"
        switch_gateway_status(session, gateway, GatewayStatus.FAILED)
        compute.deleted = True
        return

    try:
        await gateways_services.configure_gateway(connection)
    except Exception:
        logger.exception("%s: failed to configure gateway", fmt(gateway))
        gateway.status_message = "Failed to configure gateway"
        switch_gateway_status(session, gateway, GatewayStatus.FAILED)
        await gateway_connections_pool.remove(compute.ip_address)
        compute.active = False
    else:
        switch_gateway_status(session, gateway, GatewayStatus.RUNNING)
|