kubiya-control-plane-api 0.9.15__py3-none-any.whl
This diff shows the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- control_plane_api/LICENSE +676 -0
- control_plane_api/README.md +350 -0
- control_plane_api/__init__.py +4 -0
- control_plane_api/__version__.py +8 -0
- control_plane_api/alembic/README +1 -0
- control_plane_api/alembic/env.py +121 -0
- control_plane_api/alembic/script.py.mako +28 -0
- control_plane_api/alembic/versions/2613c65c3dbe_initial_database_setup.py +32 -0
- control_plane_api/alembic/versions/2df520d4927d_merge_heads.py +28 -0
- control_plane_api/alembic/versions/43abf98d6a01_add_paused_status_to_executions.py +73 -0
- control_plane_api/alembic/versions/6289854264cb_merge_multiple_heads.py +28 -0
- control_plane_api/alembic/versions/6a4d4dc3d8dc_generate_execution_transitions.py +50 -0
- control_plane_api/alembic/versions/87d11cf0a783_add_disconnected_status_to_worker_.py +44 -0
- control_plane_api/alembic/versions/add_ephemeral_queue_support.py +85 -0
- control_plane_api/alembic/versions/add_model_type_to_llm_models.py +31 -0
- control_plane_api/alembic/versions/add_plan_executions_table.py +114 -0
- control_plane_api/alembic/versions/add_trace_span_tables.py +154 -0
- control_plane_api/alembic/versions/add_user_info_to_traces.py +36 -0
- control_plane_api/alembic/versions/adjusting_foreign_keys.py +32 -0
- control_plane_api/alembic/versions/b4983d976db2_initial_tables.py +1128 -0
- control_plane_api/alembic/versions/d181a3b40e71_rename_custom_metadata_to_metadata_in_.py +50 -0
- control_plane_api/alembic/versions/df9117888e82_add_missing_columns.py +82 -0
- control_plane_api/alembic/versions/f25de6ad895a_missing_migrations.py +34 -0
- control_plane_api/alembic/versions/f71305fb69b9_fix_ephemeral_queue_deletion_foreign_key.py +54 -0
- control_plane_api/alembic/versions/mark_local_exec_queues_as_ephemeral.py +68 -0
- control_plane_api/alembic.ini +148 -0
- control_plane_api/api/index.py +12 -0
- control_plane_api/app/__init__.py +11 -0
- control_plane_api/app/activities/__init__.py +20 -0
- control_plane_api/app/activities/agent_activities.py +384 -0
- control_plane_api/app/activities/plan_generation_activities.py +499 -0
- control_plane_api/app/activities/team_activities.py +424 -0
- control_plane_api/app/activities/temporal_cloud_activities.py +588 -0
- control_plane_api/app/config/__init__.py +35 -0
- control_plane_api/app/config/api_config.py +469 -0
- control_plane_api/app/config/config_loader.py +224 -0
- control_plane_api/app/config/model_pricing.py +323 -0
- control_plane_api/app/config/storage_config.py +159 -0
- control_plane_api/app/config.py +115 -0
- control_plane_api/app/controllers/__init__.py +0 -0
- control_plane_api/app/controllers/execution_environment_controller.py +1315 -0
- control_plane_api/app/database.py +135 -0
- control_plane_api/app/exceptions.py +408 -0
- control_plane_api/app/lib/__init__.py +11 -0
- control_plane_api/app/lib/environment.py +65 -0
- control_plane_api/app/lib/event_bus/__init__.py +17 -0
- control_plane_api/app/lib/event_bus/base.py +136 -0
- control_plane_api/app/lib/event_bus/manager.py +335 -0
- control_plane_api/app/lib/event_bus/providers/__init__.py +6 -0
- control_plane_api/app/lib/event_bus/providers/http_provider.py +166 -0
- control_plane_api/app/lib/event_bus/providers/nats_provider.py +324 -0
- control_plane_api/app/lib/event_bus/providers/redis_provider.py +233 -0
- control_plane_api/app/lib/event_bus/providers/websocket_provider.py +497 -0
- control_plane_api/app/lib/job_executor.py +330 -0
- control_plane_api/app/lib/kubiya_client.py +293 -0
- control_plane_api/app/lib/litellm_pricing.py +166 -0
- control_plane_api/app/lib/mcp_validation.py +163 -0
- control_plane_api/app/lib/nats/__init__.py +13 -0
- control_plane_api/app/lib/nats/credentials_manager.py +288 -0
- control_plane_api/app/lib/nats/listener.py +374 -0
- control_plane_api/app/lib/planning_prompt_builder.py +153 -0
- control_plane_api/app/lib/planning_tools/__init__.py +41 -0
- control_plane_api/app/lib/planning_tools/agents.py +409 -0
- control_plane_api/app/lib/planning_tools/agno_toolkit.py +836 -0
- control_plane_api/app/lib/planning_tools/base.py +119 -0
- control_plane_api/app/lib/planning_tools/cognitive_memory_tools.py +403 -0
- control_plane_api/app/lib/planning_tools/context_graph_tools.py +545 -0
- control_plane_api/app/lib/planning_tools/environments.py +218 -0
- control_plane_api/app/lib/planning_tools/knowledge.py +204 -0
- control_plane_api/app/lib/planning_tools/models.py +93 -0
- control_plane_api/app/lib/planning_tools/planning_service.py +646 -0
- control_plane_api/app/lib/planning_tools/resources.py +242 -0
- control_plane_api/app/lib/planning_tools/teams.py +334 -0
- control_plane_api/app/lib/policy_enforcer_client.py +1016 -0
- control_plane_api/app/lib/redis_client.py +803 -0
- control_plane_api/app/lib/sqlalchemy_utils.py +486 -0
- control_plane_api/app/lib/state_transition_tools/__init__.py +7 -0
- control_plane_api/app/lib/state_transition_tools/execution_context.py +388 -0
- control_plane_api/app/lib/storage/__init__.py +20 -0
- control_plane_api/app/lib/storage/base_provider.py +274 -0
- control_plane_api/app/lib/storage/provider_factory.py +157 -0
- control_plane_api/app/lib/storage/vercel_blob_provider.py +468 -0
- control_plane_api/app/lib/supabase.py +71 -0
- control_plane_api/app/lib/supabase_utils.py +138 -0
- control_plane_api/app/lib/task_planning/__init__.py +138 -0
- control_plane_api/app/lib/task_planning/agent_factory.py +308 -0
- control_plane_api/app/lib/task_planning/agents.py +389 -0
- control_plane_api/app/lib/task_planning/cache.py +218 -0
- control_plane_api/app/lib/task_planning/entity_resolver.py +273 -0
- control_plane_api/app/lib/task_planning/helpers.py +293 -0
- control_plane_api/app/lib/task_planning/hooks.py +474 -0
- control_plane_api/app/lib/task_planning/models.py +503 -0
- control_plane_api/app/lib/task_planning/plan_validator.py +166 -0
- control_plane_api/app/lib/task_planning/planning_workflow.py +2911 -0
- control_plane_api/app/lib/task_planning/runner.py +656 -0
- control_plane_api/app/lib/task_planning/streaming_hook.py +213 -0
- control_plane_api/app/lib/task_planning/workflow.py +424 -0
- control_plane_api/app/lib/templating/__init__.py +88 -0
- control_plane_api/app/lib/templating/compiler.py +278 -0
- control_plane_api/app/lib/templating/engine.py +178 -0
- control_plane_api/app/lib/templating/parsers/__init__.py +29 -0
- control_plane_api/app/lib/templating/parsers/base.py +96 -0
- control_plane_api/app/lib/templating/parsers/env.py +85 -0
- control_plane_api/app/lib/templating/parsers/graph.py +112 -0
- control_plane_api/app/lib/templating/parsers/secret.py +87 -0
- control_plane_api/app/lib/templating/parsers/simple.py +81 -0
- control_plane_api/app/lib/templating/resolver.py +366 -0
- control_plane_api/app/lib/templating/types.py +214 -0
- control_plane_api/app/lib/templating/validator.py +201 -0
- control_plane_api/app/lib/temporal_client.py +232 -0
- control_plane_api/app/lib/temporal_credentials_cache.py +178 -0
- control_plane_api/app/lib/temporal_credentials_service.py +203 -0
- control_plane_api/app/lib/validation/__init__.py +24 -0
- control_plane_api/app/lib/validation/runtime_validation.py +388 -0
- control_plane_api/app/main.py +531 -0
- control_plane_api/app/middleware/__init__.py +10 -0
- control_plane_api/app/middleware/auth.py +645 -0
- control_plane_api/app/middleware/exception_handler.py +267 -0
- control_plane_api/app/middleware/prometheus_middleware.py +173 -0
- control_plane_api/app/middleware/rate_limiting.py +384 -0
- control_plane_api/app/middleware/request_id.py +202 -0
- control_plane_api/app/models/__init__.py +40 -0
- control_plane_api/app/models/agent.py +90 -0
- control_plane_api/app/models/analytics.py +206 -0
- control_plane_api/app/models/associations.py +107 -0
- control_plane_api/app/models/auth_user.py +73 -0
- control_plane_api/app/models/context.py +161 -0
- control_plane_api/app/models/custom_integration.py +99 -0
- control_plane_api/app/models/environment.py +64 -0
- control_plane_api/app/models/execution.py +125 -0
- control_plane_api/app/models/execution_transition.py +50 -0
- control_plane_api/app/models/job.py +159 -0
- control_plane_api/app/models/llm_model.py +78 -0
- control_plane_api/app/models/orchestration.py +66 -0
- control_plane_api/app/models/plan_execution.py +102 -0
- control_plane_api/app/models/presence.py +49 -0
- control_plane_api/app/models/project.py +61 -0
- control_plane_api/app/models/project_management.py +85 -0
- control_plane_api/app/models/session.py +29 -0
- control_plane_api/app/models/skill.py +155 -0
- control_plane_api/app/models/system_tables.py +43 -0
- control_plane_api/app/models/task_planning.py +372 -0
- control_plane_api/app/models/team.py +86 -0
- control_plane_api/app/models/trace.py +257 -0
- control_plane_api/app/models/user_profile.py +54 -0
- control_plane_api/app/models/worker.py +221 -0
- control_plane_api/app/models/workflow.py +161 -0
- control_plane_api/app/models/workspace.py +50 -0
- control_plane_api/app/observability/__init__.py +177 -0
- control_plane_api/app/observability/context_logging.py +475 -0
- control_plane_api/app/observability/decorators.py +337 -0
- control_plane_api/app/observability/local_span_processor.py +702 -0
- control_plane_api/app/observability/metrics.py +303 -0
- control_plane_api/app/observability/middleware.py +246 -0
- control_plane_api/app/observability/optional.py +115 -0
- control_plane_api/app/observability/tracing.py +382 -0
- control_plane_api/app/policies/README.md +149 -0
- control_plane_api/app/policies/approved_users.rego +62 -0
- control_plane_api/app/policies/business_hours.rego +51 -0
- control_plane_api/app/policies/rate_limiting.rego +100 -0
- control_plane_api/app/policies/tool_enforcement/README.md +336 -0
- control_plane_api/app/policies/tool_enforcement/bash_command_validation.rego +71 -0
- control_plane_api/app/policies/tool_enforcement/business_hours_enforcement.rego +82 -0
- control_plane_api/app/policies/tool_enforcement/mcp_tool_allowlist.rego +58 -0
- control_plane_api/app/policies/tool_enforcement/production_safeguards.rego +80 -0
- control_plane_api/app/policies/tool_enforcement/role_based_tool_access.rego +44 -0
- control_plane_api/app/policies/tool_restrictions.rego +86 -0
- control_plane_api/app/routers/__init__.py +4 -0
- control_plane_api/app/routers/agents.py +382 -0
- control_plane_api/app/routers/agents_v2.py +1598 -0
- control_plane_api/app/routers/analytics.py +1310 -0
- control_plane_api/app/routers/auth.py +59 -0
- control_plane_api/app/routers/client_config.py +57 -0
- control_plane_api/app/routers/context_graph.py +561 -0
- control_plane_api/app/routers/context_manager.py +577 -0
- control_plane_api/app/routers/custom_integrations.py +490 -0
- control_plane_api/app/routers/enforcer.py +132 -0
- control_plane_api/app/routers/environment_context.py +252 -0
- control_plane_api/app/routers/environments.py +761 -0
- control_plane_api/app/routers/execution_environment.py +847 -0
- control_plane_api/app/routers/executions/__init__.py +28 -0
- control_plane_api/app/routers/executions/router.py +286 -0
- control_plane_api/app/routers/executions/services/__init__.py +22 -0
- control_plane_api/app/routers/executions/services/demo_worker_health.py +156 -0
- control_plane_api/app/routers/executions/services/status_service.py +420 -0
- control_plane_api/app/routers/executions/services/test_worker_health.py +480 -0
- control_plane_api/app/routers/executions/services/worker_health.py +514 -0
- control_plane_api/app/routers/executions/streaming/__init__.py +22 -0
- control_plane_api/app/routers/executions/streaming/deduplication.py +352 -0
- control_plane_api/app/routers/executions/streaming/event_buffer.py +353 -0
- control_plane_api/app/routers/executions/streaming/event_formatter.py +964 -0
- control_plane_api/app/routers/executions/streaming/history_loader.py +588 -0
- control_plane_api/app/routers/executions/streaming/live_source.py +693 -0
- control_plane_api/app/routers/executions/streaming/streamer.py +849 -0
- control_plane_api/app/routers/executions.py +4888 -0
- control_plane_api/app/routers/health.py +165 -0
- control_plane_api/app/routers/health_v2.py +394 -0
- control_plane_api/app/routers/integration_templates.py +496 -0
- control_plane_api/app/routers/integrations.py +287 -0
- control_plane_api/app/routers/jobs.py +1809 -0
- control_plane_api/app/routers/metrics.py +517 -0
- control_plane_api/app/routers/models.py +82 -0
- control_plane_api/app/routers/models_v2.py +628 -0
- control_plane_api/app/routers/plan_executions.py +1481 -0
- control_plane_api/app/routers/plan_generation_async.py +304 -0
- control_plane_api/app/routers/policies.py +669 -0
- control_plane_api/app/routers/presence.py +234 -0
- control_plane_api/app/routers/projects.py +987 -0
- control_plane_api/app/routers/runners.py +379 -0
- control_plane_api/app/routers/runtimes.py +172 -0
- control_plane_api/app/routers/secrets.py +171 -0
- control_plane_api/app/routers/skills.py +1010 -0
- control_plane_api/app/routers/skills_definitions.py +140 -0
- control_plane_api/app/routers/storage.py +456 -0
- control_plane_api/app/routers/task_planning.py +611 -0
- control_plane_api/app/routers/task_queues.py +650 -0
- control_plane_api/app/routers/team_context.py +274 -0
- control_plane_api/app/routers/teams.py +1747 -0
- control_plane_api/app/routers/templates.py +248 -0
- control_plane_api/app/routers/traces.py +571 -0
- control_plane_api/app/routers/websocket_client.py +479 -0
- control_plane_api/app/routers/websocket_executions_status.py +437 -0
- control_plane_api/app/routers/websocket_gateway.py +323 -0
- control_plane_api/app/routers/websocket_traces.py +576 -0
- control_plane_api/app/routers/worker_queues.py +2555 -0
- control_plane_api/app/routers/worker_websocket.py +419 -0
- control_plane_api/app/routers/workers.py +1004 -0
- control_plane_api/app/routers/workflows.py +204 -0
- control_plane_api/app/runtimes/__init__.py +6 -0
- control_plane_api/app/runtimes/validation.py +344 -0
- control_plane_api/app/schemas/__init__.py +1 -0
- control_plane_api/app/schemas/job_schemas.py +302 -0
- control_plane_api/app/schemas/mcp_schemas.py +311 -0
- control_plane_api/app/schemas/template_schemas.py +133 -0
- control_plane_api/app/schemas/trace_schemas.py +168 -0
- control_plane_api/app/schemas/worker_queue_observability_schemas.py +165 -0
- control_plane_api/app/services/__init__.py +1 -0
- control_plane_api/app/services/agno_planning_strategy.py +233 -0
- control_plane_api/app/services/agno_service.py +838 -0
- control_plane_api/app/services/claude_code_planning_service.py +203 -0
- control_plane_api/app/services/context_graph_client.py +224 -0
- control_plane_api/app/services/custom_integration_service.py +415 -0
- control_plane_api/app/services/integration_resolution_service.py +345 -0
- control_plane_api/app/services/litellm_service.py +394 -0
- control_plane_api/app/services/plan_generator.py +79 -0
- control_plane_api/app/services/planning_strategy.py +66 -0
- control_plane_api/app/services/planning_strategy_factory.py +118 -0
- control_plane_api/app/services/policy_service.py +615 -0
- control_plane_api/app/services/state_transition_service.py +755 -0
- control_plane_api/app/services/storage_service.py +593 -0
- control_plane_api/app/services/temporal_cloud_provisioning.py +150 -0
- control_plane_api/app/services/toolsets/context_graph_skill.py +432 -0
- control_plane_api/app/services/trace_retention.py +354 -0
- control_plane_api/app/services/worker_queue_metrics_service.py +190 -0
- control_plane_api/app/services/workflow_cancellation_manager.py +135 -0
- control_plane_api/app/services/workflow_operations_service.py +611 -0
- control_plane_api/app/skills/__init__.py +100 -0
- control_plane_api/app/skills/base.py +239 -0
- control_plane_api/app/skills/builtin/__init__.py +37 -0
- control_plane_api/app/skills/builtin/agent_communication/__init__.py +8 -0
- control_plane_api/app/skills/builtin/agent_communication/skill.py +246 -0
- control_plane_api/app/skills/builtin/code_ingestion/__init__.py +4 -0
- control_plane_api/app/skills/builtin/code_ingestion/skill.py +267 -0
- control_plane_api/app/skills/builtin/cognitive_memory/__init__.py +4 -0
- control_plane_api/app/skills/builtin/cognitive_memory/skill.py +174 -0
- control_plane_api/app/skills/builtin/contextual_awareness/__init__.py +4 -0
- control_plane_api/app/skills/builtin/contextual_awareness/skill.py +387 -0
- control_plane_api/app/skills/builtin/data_visualization/__init__.py +4 -0
- control_plane_api/app/skills/builtin/data_visualization/skill.py +154 -0
- control_plane_api/app/skills/builtin/docker/__init__.py +4 -0
- control_plane_api/app/skills/builtin/docker/skill.py +104 -0
- control_plane_api/app/skills/builtin/file_generation/__init__.py +4 -0
- control_plane_api/app/skills/builtin/file_generation/skill.py +94 -0
- control_plane_api/app/skills/builtin/file_system/__init__.py +4 -0
- control_plane_api/app/skills/builtin/file_system/skill.py +110 -0
- control_plane_api/app/skills/builtin/knowledge_api/__init__.py +5 -0
- control_plane_api/app/skills/builtin/knowledge_api/skill.py +124 -0
- control_plane_api/app/skills/builtin/python/__init__.py +4 -0
- control_plane_api/app/skills/builtin/python/skill.py +92 -0
- control_plane_api/app/skills/builtin/remote_filesystem/__init__.py +5 -0
- control_plane_api/app/skills/builtin/remote_filesystem/skill.py +170 -0
- control_plane_api/app/skills/builtin/shell/__init__.py +4 -0
- control_plane_api/app/skills/builtin/shell/skill.py +161 -0
- control_plane_api/app/skills/builtin/slack/__init__.py +3 -0
- control_plane_api/app/skills/builtin/slack/skill.py +302 -0
- control_plane_api/app/skills/builtin/workflow_executor/__init__.py +4 -0
- control_plane_api/app/skills/builtin/workflow_executor/skill.py +469 -0
- control_plane_api/app/skills/business_intelligence.py +189 -0
- control_plane_api/app/skills/config.py +63 -0
- control_plane_api/app/skills/loaders/__init__.py +14 -0
- control_plane_api/app/skills/loaders/base.py +73 -0
- control_plane_api/app/skills/loaders/filesystem_loader.py +199 -0
- control_plane_api/app/skills/registry.py +125 -0
- control_plane_api/app/utils/helpers.py +12 -0
- control_plane_api/app/utils/workflow_executor.py +354 -0
- control_plane_api/app/workflows/__init__.py +11 -0
- control_plane_api/app/workflows/agent_execution.py +520 -0
- control_plane_api/app/workflows/agent_execution_with_skills.py +223 -0
- control_plane_api/app/workflows/namespace_provisioning.py +326 -0
- control_plane_api/app/workflows/plan_generation.py +254 -0
- control_plane_api/app/workflows/team_execution.py +442 -0
- control_plane_api/scripts/seed_models.py +240 -0
- control_plane_api/scripts/validate_existing_tool_names.py +492 -0
- control_plane_api/shared/__init__.py +8 -0
- control_plane_api/shared/version.py +17 -0
- control_plane_api/test_deduplication.py +274 -0
- control_plane_api/test_executor_deduplication_e2e.py +309 -0
- control_plane_api/test_job_execution_e2e.py +283 -0
- control_plane_api/test_real_integration.py +193 -0
- control_plane_api/version.py +38 -0
- control_plane_api/worker/__init__.py +0 -0
- control_plane_api/worker/activities/__init__.py +0 -0
- control_plane_api/worker/activities/agent_activities.py +1585 -0
- control_plane_api/worker/activities/approval_activities.py +234 -0
- control_plane_api/worker/activities/job_activities.py +199 -0
- control_plane_api/worker/activities/runtime_activities.py +1167 -0
- control_plane_api/worker/activities/skill_activities.py +282 -0
- control_plane_api/worker/activities/team_activities.py +479 -0
- control_plane_api/worker/agent_runtime_server.py +370 -0
- control_plane_api/worker/binary_manager.py +333 -0
- control_plane_api/worker/config/__init__.py +31 -0
- control_plane_api/worker/config/worker_config.py +273 -0
- control_plane_api/worker/control_plane_client.py +1491 -0
- control_plane_api/worker/examples/analytics_integration_example.py +362 -0
- control_plane_api/worker/health_monitor.py +159 -0
- control_plane_api/worker/metrics.py +237 -0
- control_plane_api/worker/models/__init__.py +1 -0
- control_plane_api/worker/models/error_events.py +105 -0
- control_plane_api/worker/models/inputs.py +89 -0
- control_plane_api/worker/runtimes/__init__.py +35 -0
- control_plane_api/worker/runtimes/agent_runtime/runtime.py +485 -0
- control_plane_api/worker/runtimes/agno/__init__.py +34 -0
- control_plane_api/worker/runtimes/agno/config.py +248 -0
- control_plane_api/worker/runtimes/agno/hooks.py +385 -0
- control_plane_api/worker/runtimes/agno/mcp_builder.py +195 -0
- control_plane_api/worker/runtimes/agno/runtime.py +1063 -0
- control_plane_api/worker/runtimes/agno/utils.py +163 -0
- control_plane_api/worker/runtimes/base.py +979 -0
- control_plane_api/worker/runtimes/claude_code/__init__.py +38 -0
- control_plane_api/worker/runtimes/claude_code/cleanup.py +184 -0
- control_plane_api/worker/runtimes/claude_code/client_pool.py +529 -0
- control_plane_api/worker/runtimes/claude_code/config.py +829 -0
- control_plane_api/worker/runtimes/claude_code/hooks.py +482 -0
- control_plane_api/worker/runtimes/claude_code/litellm_proxy.py +1702 -0
- control_plane_api/worker/runtimes/claude_code/mcp_builder.py +467 -0
- control_plane_api/worker/runtimes/claude_code/mcp_discovery.py +558 -0
- control_plane_api/worker/runtimes/claude_code/runtime.py +1546 -0
- control_plane_api/worker/runtimes/claude_code/tool_mapper.py +403 -0
- control_plane_api/worker/runtimes/claude_code/utils.py +149 -0
- control_plane_api/worker/runtimes/factory.py +173 -0
- control_plane_api/worker/runtimes/model_utils.py +107 -0
- control_plane_api/worker/runtimes/validation.py +93 -0
- control_plane_api/worker/services/__init__.py +1 -0
- control_plane_api/worker/services/agent_communication_tools.py +908 -0
- control_plane_api/worker/services/agent_executor.py +485 -0
- control_plane_api/worker/services/agent_executor_v2.py +793 -0
- control_plane_api/worker/services/analytics_collector.py +457 -0
- control_plane_api/worker/services/analytics_service.py +464 -0
- control_plane_api/worker/services/approval_tools.py +310 -0
- control_plane_api/worker/services/approval_tools_agno.py +207 -0
- control_plane_api/worker/services/cancellation_manager.py +177 -0
- control_plane_api/worker/services/code_ingestion_tools.py +465 -0
- control_plane_api/worker/services/contextual_awareness_tools.py +405 -0
- control_plane_api/worker/services/data_visualization.py +834 -0
- control_plane_api/worker/services/event_publisher.py +531 -0
- control_plane_api/worker/services/jira_tools.py +257 -0
- control_plane_api/worker/services/remote_filesystem_tools.py +498 -0
- control_plane_api/worker/services/runtime_analytics.py +328 -0
- control_plane_api/worker/services/session_service.py +365 -0
- control_plane_api/worker/services/skill_context_enhancement.py +181 -0
- control_plane_api/worker/services/skill_factory.py +471 -0
- control_plane_api/worker/services/system_prompt_enhancement.py +410 -0
- control_plane_api/worker/services/team_executor.py +715 -0
- control_plane_api/worker/services/team_executor_v2.py +1866 -0
- control_plane_api/worker/services/tool_enforcement.py +254 -0
- control_plane_api/worker/services/workflow_executor/__init__.py +52 -0
- control_plane_api/worker/services/workflow_executor/event_processor.py +287 -0
- control_plane_api/worker/services/workflow_executor/event_publisher.py +210 -0
- control_plane_api/worker/services/workflow_executor/executors/__init__.py +15 -0
- control_plane_api/worker/services/workflow_executor/executors/base.py +270 -0
- control_plane_api/worker/services/workflow_executor/executors/json_executor.py +50 -0
- control_plane_api/worker/services/workflow_executor/executors/python_executor.py +50 -0
- control_plane_api/worker/services/workflow_executor/models.py +142 -0
- control_plane_api/worker/services/workflow_executor_tools.py +1748 -0
- control_plane_api/worker/skills/__init__.py +12 -0
- control_plane_api/worker/skills/builtin/context_graph_search/README.md +213 -0
- control_plane_api/worker/skills/builtin/context_graph_search/__init__.py +5 -0
- control_plane_api/worker/skills/builtin/context_graph_search/agno_impl.py +808 -0
- control_plane_api/worker/skills/builtin/context_graph_search/skill.yaml +67 -0
- control_plane_api/worker/skills/builtin/contextual_awareness/__init__.py +4 -0
- control_plane_api/worker/skills/builtin/contextual_awareness/agno_impl.py +62 -0
- control_plane_api/worker/skills/builtin/data_visualization/agno_impl.py +18 -0
- control_plane_api/worker/skills/builtin/data_visualization/skill.yaml +84 -0
- control_plane_api/worker/skills/builtin/docker/agno_impl.py +65 -0
- control_plane_api/worker/skills/builtin/docker/skill.yaml +60 -0
- control_plane_api/worker/skills/builtin/file_generation/agno_impl.py +47 -0
- control_plane_api/worker/skills/builtin/file_generation/skill.yaml +64 -0
- control_plane_api/worker/skills/builtin/file_system/agno_impl.py +32 -0
- control_plane_api/worker/skills/builtin/file_system/skill.yaml +54 -0
- control_plane_api/worker/skills/builtin/knowledge_api/__init__.py +4 -0
- control_plane_api/worker/skills/builtin/knowledge_api/agno_impl.py +50 -0
- control_plane_api/worker/skills/builtin/knowledge_api/skill.yaml +66 -0
- control_plane_api/worker/skills/builtin/python/agno_impl.py +25 -0
- control_plane_api/worker/skills/builtin/python/skill.yaml +60 -0
- control_plane_api/worker/skills/builtin/schema_fix_mixin.py +260 -0
- control_plane_api/worker/skills/builtin/shell/agno_impl.py +31 -0
- control_plane_api/worker/skills/builtin/shell/skill.yaml +60 -0
- control_plane_api/worker/skills/builtin/slack/__init__.py +3 -0
- control_plane_api/worker/skills/builtin/slack/agno_impl.py +1282 -0
- control_plane_api/worker/skills/builtin/slack/skill.yaml +276 -0
- control_plane_api/worker/skills/builtin/workflow_executor/agno_impl.py +62 -0
- control_plane_api/worker/skills/builtin/workflow_executor/skill.yaml +79 -0
- control_plane_api/worker/skills/loaders/__init__.py +5 -0
- control_plane_api/worker/skills/loaders/base.py +23 -0
- control_plane_api/worker/skills/loaders/filesystem_loader.py +357 -0
- control_plane_api/worker/skills/registry.py +208 -0
- control_plane_api/worker/tests/__init__.py +1 -0
- control_plane_api/worker/tests/conftest.py +12 -0
- control_plane_api/worker/tests/e2e/__init__.py +0 -0
- control_plane_api/worker/tests/e2e/test_context_graph_real_api.py +338 -0
- control_plane_api/worker/tests/e2e/test_context_graph_templates_e2e.py +523 -0
- control_plane_api/worker/tests/e2e/test_enforcement_e2e.py +344 -0
- control_plane_api/worker/tests/e2e/test_execution_flow.py +571 -0
- control_plane_api/worker/tests/e2e/test_single_execution_mode.py +656 -0
- control_plane_api/worker/tests/integration/__init__.py +0 -0
- control_plane_api/worker/tests/integration/test_builtin_skills_fixes.py +245 -0
- control_plane_api/worker/tests/integration/test_context_graph_search_integration.py +365 -0
- control_plane_api/worker/tests/integration/test_control_plane_integration.py +308 -0
- control_plane_api/worker/tests/integration/test_hook_enforcement_integration.py +579 -0
- control_plane_api/worker/tests/integration/test_scheduled_job_workflow.py +237 -0
- control_plane_api/worker/tests/integration/test_system_prompt_enhancement_integration.py +343 -0
- control_plane_api/worker/tests/unit/__init__.py +0 -0
- control_plane_api/worker/tests/unit/test_builtin_skill_autoload.py +396 -0
- control_plane_api/worker/tests/unit/test_context_graph_search.py +450 -0
- control_plane_api/worker/tests/unit/test_context_graph_templates.py +403 -0
- control_plane_api/worker/tests/unit/test_control_plane_client.py +401 -0
- control_plane_api/worker/tests/unit/test_control_plane_client_jobs.py +345 -0
- control_plane_api/worker/tests/unit/test_job_activities.py +353 -0
- control_plane_api/worker/tests/unit/test_skill_context_enhancement.py +321 -0
- control_plane_api/worker/tests/unit/test_system_prompt_enhancement.py +415 -0
- control_plane_api/worker/tests/unit/test_tool_enforcement.py +324 -0
- control_plane_api/worker/utils/__init__.py +1 -0
- control_plane_api/worker/utils/chunk_batcher.py +330 -0
- control_plane_api/worker/utils/environment.py +65 -0
- control_plane_api/worker/utils/error_publisher.py +260 -0
- control_plane_api/worker/utils/event_batcher.py +256 -0
- control_plane_api/worker/utils/logging_config.py +335 -0
- control_plane_api/worker/utils/logging_helper.py +326 -0
- control_plane_api/worker/utils/parameter_validator.py +120 -0
- control_plane_api/worker/utils/retry_utils.py +60 -0
- control_plane_api/worker/utils/streaming_utils.py +665 -0
- control_plane_api/worker/utils/tool_validation.py +332 -0
- control_plane_api/worker/utils/workspace_manager.py +163 -0
- control_plane_api/worker/websocket_client.py +393 -0
- control_plane_api/worker/worker.py +1297 -0
- control_plane_api/worker/workflows/__init__.py +0 -0
- control_plane_api/worker/workflows/agent_execution.py +909 -0
- control_plane_api/worker/workflows/scheduled_job_wrapper.py +332 -0
- control_plane_api/worker/workflows/team_execution.py +611 -0
- kubiya_control_plane_api-0.9.15.dist-info/METADATA +354 -0
- kubiya_control_plane_api-0.9.15.dist-info/RECORD +479 -0
- kubiya_control_plane_api-0.9.15.dist-info/WHEEL +5 -0
- kubiya_control_plane_api-0.9.15.dist-info/entry_points.txt +5 -0
- kubiya_control_plane_api-0.9.15.dist-info/licenses/LICENSE +676 -0
- kubiya_control_plane_api-0.9.15.dist-info/top_level.txt +3 -0
- scripts/__init__.py +1 -0
- scripts/migrations.py +39 -0
- scripts/seed_worker_queues.py +128 -0
- scripts/setup_agent_runtime.py +142 -0
- worker_internal/__init__.py +1 -0
- worker_internal/planner/__init__.py +1 -0
- worker_internal/planner/activities.py +1499 -0
- worker_internal/planner/agent_tools.py +197 -0
- worker_internal/planner/event_models.py +148 -0
- worker_internal/planner/event_publisher.py +67 -0
- worker_internal/planner/models.py +199 -0
- worker_internal/planner/retry_logic.py +134 -0
- worker_internal/planner/worker.py +300 -0
- worker_internal/planner/workflows.py +970 -0
control_plane_api/app/routers/worker_queues.py
@@ -0,0 +1,2555 @@
"""
Worker Queues router - Manage worker queues within environments.

Each environment can have multiple worker queues for fine-grained worker management.
Task queue naming: {org_id}.{environment_name}.{worker_queue_name}
"""

from fastapi import APIRouter, Depends, HTTPException, status, Request
from fastapi.responses import PlainTextResponse
from typing import List, Optional, Literal, Dict
from datetime import datetime, timedelta, timezone
from pydantic import BaseModel, Field, field_validator
import structlog
import uuid
import os
import json
import hashlib

from control_plane_api.app.utils.helpers import is_local_temporal
from control_plane_api.app.middleware.auth import get_current_organization
from control_plane_api.app.lib.redis_client import get_redis_client
from control_plane_api.app.database import get_db
from control_plane_api.app.models.worker import WorkerQueue, WorkerHeartbeat
from control_plane_api.app.models.environment import Environment
from control_plane_api.app.models.execution import Execution
from control_plane_api.app.config import settings
from control_plane_api.app.schemas.worker_queue_observability_schemas import (
    WorkerQueueMetricsResponse,
    WorkflowsListResponse
)
from control_plane_api.app.services.worker_queue_metrics_service import WorkerQueueMetricsService
from sqlalchemy.orm import Session, joinedload
from sqlalchemy import desc
from control_plane_api.app.lib.environment import detect_environment
from control_plane_api.app.observability import (
    instrument_endpoint,
    create_span_with_context,
    add_span_event,
    add_span_error,
)

logger = structlog.get_logger()

router = APIRouter()

# Stale worker threshold: must be >= 2x the heartbeat interval to avoid false negatives.
# The default heartbeat interval is 60s, so the threshold is 120s (2x) plus a 30s grace period.
# Workers that haven't sent a heartbeat in 150s are considered inactive.
STALE_WORKER_THRESHOLD_SECONDS = 150

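The constant encodes the rule stated in the comment: twice the 60s default interval plus a 30s grace period. A minimal sketch of the same arithmetic, assuming one wanted to derive the threshold from a queue's configurable heartbeat_interval (the helper name is hypothetical; the package pins the constant instead):

# Hypothetical helper, not part of the package: derive a stale threshold from a
# heartbeat interval using the 2x-plus-grace rule described in the comment above.
def stale_threshold_seconds(heartbeat_interval: int, grace_period: int = 30) -> int:
    # 2x the interval tolerates one missed heartbeat; the grace period absorbs clock skew
    return 2 * heartbeat_interval + grace_period

assert stale_threshold_seconds(60) == 150  # matches STALE_WORKER_THRESHOLD_SECONDS
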
# LiteLLM Configuration Schemas
class LiteLLMModelConfig(BaseModel):
    """Single model configuration for LiteLLM proxy"""
    model_name: str = Field(..., description="User-facing model name (e.g., gpt-4)")
    litellm_params: dict = Field(..., description="Parameters passed to litellm.completion() including model, api_base, api_key, etc.")


class LiteLLMConfig(BaseModel):
    """Complete LiteLLM proxy configuration for local worker proxy"""
    model_list: List[LiteLLMModelConfig] = Field(..., description="List of models to configure in the local proxy")
    litellm_settings: Optional[dict] = Field(None, description="LiteLLM settings (callbacks, rate limits, etc.)")
    environment_variables: Optional[dict] = Field(None, description="Environment variables for the proxy (Langfuse keys, etc.)")


class QueueSettings(BaseModel):
    """Worker queue settings schema with validation"""
    enable_local_litellm_proxy: bool = Field(False, description="Enable local LiteLLM proxy for this queue")
    litellm_config: Optional[LiteLLMConfig] = Field(None, description="LiteLLM proxy configuration (required if enable_local_litellm_proxy is true)")
    local_proxy_timeout_seconds: int = Field(10, ge=5, le=60, description="Proxy startup timeout in seconds")
    local_proxy_max_retries: int = Field(3, ge=1, le=10, description="Maximum retry attempts for proxy startup")

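For illustration, a settings dict that passes the QueueSettings schema above. Only the field names come from the schemas; the model name, api_base, and key values are made up:

# Hypothetical values; structure mirrors QueueSettings / LiteLLMConfig above.
example_settings = {
    "enable_local_litellm_proxy": True,
    "litellm_config": {
        "model_list": [
            {
                "model_name": "gpt-4",
                "litellm_params": {"model": "openai/gpt-4", "api_base": "https://example.invalid/v1"},
            }
        ],
        "litellm_settings": {"num_retries": 2},
        "environment_variables": {"LANGFUSE_PUBLIC_KEY": "pk-example"},
    },
    "local_proxy_timeout_seconds": 15,
    "local_proxy_max_retries": 3,
}
QueueSettings(**example_settings)  # validates; raises pydantic.ValidationError otherwise
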
async def get_active_workers_from_redis(org_id: str, queue_id: Optional[str] = None, db: Session = None) -> dict:
    """
    Get active workers from Redis heartbeats.

    Redis heartbeats have automatic TTL (5 minutes), so if a worker hasn't sent a heartbeat
    the key will automatically expire. This eliminates the need to manually mark workers as stale.

    Args:
        org_id: Organization ID
        queue_id: Optional queue ID to filter by
        db: Database session (optional)

    Returns:
        Dict with worker_id -> heartbeat_data mapping
    """
    redis_client = get_redis_client()

    if not redis_client:
        logger.warning("redis_unavailable_for_worker_query", org_id=org_id)
        return {}

    # If no session provided, create one
    should_close_db = False
    if db is None:
        from control_plane_api.app.database import get_session_local
        SessionLocal = get_session_local()
        db = SessionLocal()
        should_close_db = True

    try:
        # Get all worker heartbeat keys for this org
        # We need to get worker records from DB to map worker_id -> queue_id
        workers_db = db.query(WorkerHeartbeat).filter(
            WorkerHeartbeat.organization_id == org_id
        ).all()

        if not workers_db:
            return {}

        # Filter workers by queue_id if specified
        workers_to_check = []
        worker_queue_map = {}
        # Also track registered_at times (as timezone-aware datetimes)
        worker_registered_at = {}
        for worker in workers_db:
            worker_id = str(worker.id)
            worker_queue_id = str(worker.worker_queue_id) if worker.worker_queue_id else None

            # Skip if queue_id filter is specified and doesn't match
            if queue_id and worker_queue_id != queue_id:
                continue

            workers_to_check.append(worker_id)
            worker_queue_map[worker_id] = worker_queue_id
            # Ensure registered_at is timezone-aware for any future comparisons
            if worker.registered_at:
                reg_at = worker.registered_at
                if reg_at.tzinfo is None:
                    reg_at = reg_at.replace(tzinfo=timezone.utc)
                worker_registered_at[worker_id] = reg_at

        if not workers_to_check:
            return {}

        # Batch fetch all heartbeats in a single Redis pipeline request
        redis_keys = [f"worker:{worker_id}:heartbeat" for worker_id in workers_to_check]
        heartbeat_results = await redis_client.mget(redis_keys)

        # Process results
        active_workers = {}
        now_utc = datetime.now(timezone.utc)  # Pre-compute timezone-aware now

        for worker_id in workers_to_check:
            redis_key = f"worker:{worker_id}:heartbeat"
            heartbeat_data = heartbeat_results.get(redis_key)

            if heartbeat_data:
                try:
                    data = json.loads(heartbeat_data)
                    # Check if heartbeat is recent (within threshold)
                    last_heartbeat_str = data.get("last_heartbeat", "")
                    if not last_heartbeat_str:
                        logger.warning("missing_last_heartbeat", worker_id=worker_id)
                        continue

                    # Handle ISO format with 'Z' suffix (Python < 3.11 doesn't handle 'Z')
                    if last_heartbeat_str.endswith('Z'):
                        last_heartbeat_str = last_heartbeat_str[:-1] + '+00:00'

                    last_heartbeat = datetime.fromisoformat(last_heartbeat_str)

                    # Ensure timezone-aware datetime
                    if last_heartbeat.tzinfo is None:
                        last_heartbeat = last_heartbeat.replace(tzinfo=timezone.utc)

                    # Calculate age - convert both to timestamps to avoid timezone issues
                    try:
                        now_ts = datetime.now(timezone.utc).timestamp()
                        # Convert last_heartbeat to timestamp
                        if last_heartbeat.tzinfo is None:
                            last_heartbeat = last_heartbeat.replace(tzinfo=timezone.utc)
                        hb_ts = last_heartbeat.timestamp()
                        age_seconds = now_ts - hb_ts
                    except (TypeError, AttributeError, OSError) as dt_err:
                        # If datetime comparison fails, skip this worker
                        logger.warning("datetime_comparison_failed", worker_id=worker_id, error=str(dt_err))
                        continue

                    if age_seconds <= STALE_WORKER_THRESHOLD_SECONDS:
                        active_workers[worker_id] = {
                            **data,
                            "worker_queue_id": worker_queue_map[worker_id],
                        }
                    else:
                        logger.debug(
                            "worker_heartbeat_stale",
                            worker_id=worker_id,
                            age_seconds=age_seconds,
                            threshold=STALE_WORKER_THRESHOLD_SECONDS
                        )
                except (json.JSONDecodeError, ValueError, TypeError) as e:
                    logger.warning("invalid_heartbeat_data", worker_id=worker_id, error=str(e))
                    continue

        logger.debug(
            "active_workers_fetched",
            org_id=org_id,
            total_workers=len(workers_to_check),
            active_workers=len(active_workers),
            queue_id=queue_id,
        )

        return active_workers

    except Exception as e:
        import traceback
        logger.error(
            "failed_to_get_active_workers_from_redis",
            error=str(e),
            org_id=org_id,
            error_type=type(e).__name__,
            line_info=traceback.format_exc().split("\n")[-3] if traceback.format_exc() else "unknown",
        )
        return {}
    finally:
        if should_close_db and db:
            db.close()

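For this function to count a worker as active, the Redis value must be JSON containing a recent last_heartbeat timestamp under the key worker:{worker_id}:heartbeat. A standalone sketch of the minimal payload a worker might write; any fields beyond last_heartbeat, and the write call itself, are assumptions (the worker-side publisher lives elsewhere in the package):

# Sketch only: get_active_workers_from_redis requires just "last_heartbeat" here.
import json
from datetime import datetime, timezone

worker_id = "worker-uuid-placeholder"  # hypothetical worker UUID
payload = {"last_heartbeat": datetime.now(timezone.utc).isoformat()}
# await redis_client.set(f"worker:{worker_id}:heartbeat", json.dumps(payload), ex=300)
# ex=300 would mirror the 5-minute TTL mentioned in the docstring above
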
# Pydantic schemas
class WorkerQueueCreate(BaseModel):
    name: str = Field(..., min_length=2, max_length=50, description="Worker queue name (lowercase, no spaces)")
    display_name: Optional[str] = Field(None, description="User-friendly display name")
    description: Optional[str] = Field(None, description="Queue description")
    max_workers: Optional[int] = Field(None, ge=1, description="Max workers allowed (NULL = unlimited)")
    heartbeat_interval: int = Field(60, ge=10, le=300, description="Seconds between heartbeats (lightweight)")
    tags: List[str] = Field(default_factory=list)
    settings: dict = Field(default_factory=dict)

    @field_validator("settings")
    def validate_settings(cls, v):
        """Validate settings structure including litellm_config"""
        if not v:
            return v

        try:
            # Validate entire settings dict using QueueSettings schema
            QueueSettings(**v)
        except Exception as e:
            raise ValueError(f"Invalid settings: {str(e)}")

        # Additional validation: if enable_local_litellm_proxy is true, litellm_config is required
        if v.get("enable_local_litellm_proxy") and not v.get("litellm_config"):
            raise ValueError("litellm_config is required when enable_local_litellm_proxy is true")

        return v


class WorkerQueueUpdate(BaseModel):
    name: Optional[str] = Field(None, min_length=2, max_length=50)
    display_name: Optional[str] = None
    description: Optional[str] = None
    status: Optional[str] = None
    max_workers: Optional[int] = Field(None, ge=1)
    heartbeat_interval: Optional[int] = Field(None, ge=10, le=300)
    tags: Optional[List[str]] = None
    settings: Optional[dict] = None

    @field_validator("settings")
    def validate_settings(cls, v):
        """Validate settings structure including litellm_config"""
        if not v:
            return v

        try:
            # Validate entire settings dict using QueueSettings schema
            QueueSettings(**v)
        except Exception as e:
            raise ValueError(f"Invalid settings: {str(e)}")

        # Additional validation: if enable_local_litellm_proxy is true, litellm_config is required
        if v.get("enable_local_litellm_proxy") and not v.get("litellm_config"):
            raise ValueError("litellm_config is required when enable_local_litellm_proxy is true")

        return v


class WorkerQueueResponse(BaseModel):
    id: str
    organization_id: str
    environment_id: str
    name: str
    display_name: Optional[str]
    description: Optional[str]
    status: str
    max_workers: Optional[int]
    heartbeat_interval: int
    tags: List[str]
    settings: dict
    created_at: datetime
    updated_at: datetime
    created_by: Optional[str]
    # Computed
    active_workers: int = 0
    task_queue_name: str  # Full task queue name: org.env.worker_queue

    @field_validator("id", "environment_id", "created_by", mode="before")
    def cast_to_string(cls, v):
        if v is None:
            return None
        return str(v)

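The settings validator above enforces the proxy/config pairing, so a malformed body is rejected before any endpoint runs. A quick sketch of the failure mode (pydantic wraps the ValueError raised by the validator into a ValidationError, which FastAPI surfaces as a 422):

# Enabling the proxy without a litellm_config fails model validation.
try:
    WorkerQueueCreate(name="my-queue", settings={"enable_local_litellm_proxy": True})
except ValueError as exc:  # pydantic.ValidationError subclasses ValueError
    print(exc)  # ...litellm_config is required when enable_local_litellm_proxy is true
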
@router.get("/worker-queues", response_model=List[WorkerQueueResponse])
@instrument_endpoint("worker_queues.list_all_worker_queues")
async def list_all_worker_queues(
    request: Request,
    organization: dict = Depends(get_current_organization),
    db: Session = Depends(get_db),
):
    """List all worker queues across all environments for the organization (excluding ephemeral queues)"""
    try:
        org_id = organization["id"]

        # Get all non-ephemeral worker queues for this organization with environment relationship
        # Also exclude queues starting with "local-exec" (ephemeral local execution queues)
        queues_db = (
            db.query(WorkerQueue)
            .options(joinedload(WorkerQueue.environment))
            .filter(
                WorkerQueue.organization_id == org_id,
                WorkerQueue.ephemeral == False,  # Exclude ephemeral queues
                ~WorkerQueue.name.startswith('local-exec')  # Exclude local-exec queues
            )
            .order_by(WorkerQueue.created_at.asc())
            .all()
        )

        if not queues_db:
            return []

        # Get active workers from Redis (with automatic TTL-based expiration)
        active_workers = await get_active_workers_from_redis(org_id, db=db)

        # Count workers per queue
        worker_counts = {}
        for worker_id, worker_data in active_workers.items():
            queue_id = worker_data.get("worker_queue_id")
            if queue_id:
                worker_counts[queue_id] = worker_counts.get(queue_id, 0) + 1

        # Build response
        queues = []
        for queue in queues_db:
            # Use queue UUID as task queue name for security
            task_queue_name = str(queue.id)
            active_worker_count = worker_counts.get(str(queue.id), 0)

            # Get environment name from relationship
            environment_name = queue.environment.name if queue.environment else None

            from sqlalchemy.inspection import inspect
            queue_dict = {c.key: getattr(queue, c.key) for c in inspect(queue).mapper.column_attrs}

            queues.append(
                WorkerQueueResponse(
                    **queue_dict,
                    active_workers=active_worker_count,
                    task_queue_name=task_queue_name,
                    environment_name=environment_name,
                )
            )

        logger.info(
            "all_worker_queues_listed",
            count=len(queues),
            org_id=org_id,
        )

        return queues

    except HTTPException:
        raise
    except Exception as e:
        logger.error("all_worker_queues_list_failed", error=str(e), org_id=org_id)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to list all worker queues: {str(e)}"
        )

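A hypothetical client call against the listing endpoint above. The base URL, router mount prefix, and auth header are assumptions; this file only defines the relative route and the get_current_organization dependency:

# Sketch with httpx; adjust the base URL and credentials for a real deployment.
import httpx

resp = httpx.get(
    "http://localhost:8000/worker-queues",          # assumed mount point
    headers={"Authorization": "Bearer <api-key>"},  # assumed auth scheme
)
resp.raise_for_status()
for q in resp.json():
    print(q["name"], q["active_workers"], q["task_queue_name"])
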
@router.post("/environments/{environment_id}/worker-queues", response_model=WorkerQueueResponse, status_code=status.HTTP_201_CREATED)
@instrument_endpoint("worker_queues.create_worker_queue")
async def create_worker_queue(
    environment_id: str,
    queue_data: WorkerQueueCreate,
    request: Request,
    organization: dict = Depends(get_current_organization),
    db: Session = Depends(get_db),
):
    """Create a new worker queue within an environment"""
    try:
        org_id = organization["id"]

        # Validate environment exists
        environment = (
            db.query(Environment)
            .filter(Environment.id == environment_id, Environment.organization_id == org_id)
            .first()
        )

        if not environment:
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail="Environment not found"
            )

        # Check if worker queue name already exists in this environment
        existing = (
            db.query(WorkerQueue)
            .filter(
                WorkerQueue.environment_id == environment_id,
                WorkerQueue.name == queue_data.name
            )
            .first()
        )

        if existing:
            raise HTTPException(
                status_code=status.HTTP_409_CONFLICT,
                detail=f"Worker queue '{queue_data.name}' already exists in this environment"
            )

        # Create worker queue
        queue_id = str(uuid.uuid4())
        now = datetime.now(timezone.utc)

        # Automatically mark as ephemeral if name starts with "local-exec"
        is_ephemeral = queue_data.name.startswith("local-exec")

        queue = WorkerQueue(
            id=queue_id,
            organization_id=org_id,
            environment_id=environment_id,
            name=queue_data.name,
            display_name=queue_data.display_name or queue_data.name,
            description=queue_data.description,
            status="active",
            max_workers=queue_data.max_workers,
            heartbeat_interval=queue_data.heartbeat_interval,
            tags=queue_data.tags,
            settings=queue_data.settings,
            ephemeral=is_ephemeral,
            created_at=now,
            updated_at=now,
            created_by=organization.get("user_id"),
        )

        db.add(queue)
        db.commit()
        db.refresh(queue)

        # Convert to dict for Pydantic response
        from sqlalchemy.inspection import inspect
        queue_dict = {c.key: getattr(queue, c.key) for c in inspect(queue).mapper.column_attrs}

        # Use queue UUID as task queue name for security (unpredictable)
        task_queue_name = queue_id

        logger.info(
            "worker_queue_created",
            queue_id=queue_id,
            queue_name=queue.name,
            environment_id=environment_id,
            task_queue_name=task_queue_name,
            org_id=org_id,
        )

        return WorkerQueueResponse(
            **queue_dict,
            active_workers=0,
            task_queue_name=task_queue_name,
        )

    except HTTPException:
        raise
    except Exception as e:
        logger.error("worker_queue_creation_failed", error=str(e), org_id=organization["id"])
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to create worker queue: {str(e)}"
        )

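A matching sketch for queue creation, with the same assumptions about mount point and auth; the environment UUID is a placeholder and the body fields come from WorkerQueueCreate:

import httpx

body = {
    "name": "gpu-workers",
    "display_name": "GPU Workers",
    "max_workers": 8,
    "heartbeat_interval": 60,
    "tags": ["gpu"],
}
resp = httpx.post(
    "http://localhost:8000/environments/<environment-uuid>/worker-queues",  # assumed mount point
    headers={"Authorization": "Bearer <api-key>"},                          # assumed auth scheme
    json=body,
)
# Per the handler above: 201 on success, 404 for an unknown environment, 409 for a duplicate name
print(resp.status_code, resp.json().get("task_queue_name"))
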
@router.get("/environments/{environment_id}/worker-queues", response_model=List[WorkerQueueResponse])
@instrument_endpoint("worker_queues.list_worker_queues")
async def list_worker_queues(
    environment_id: str,
    request: Request,
    organization: dict = Depends(get_current_organization),
    db: Session = Depends(get_db),
):
    """List all worker queues in an environment (excluding ephemeral queues)"""
    try:
        org_id = organization["id"]

        # Get environment name
        environment = (
            db.query(Environment)
            .filter(Environment.id == environment_id, Environment.organization_id == org_id)
            .first()
        )

        if not environment:
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail="Environment not found"
            )

        environment_name = environment.name

        # Get non-ephemeral worker queues only
        # Also exclude queues starting with "local-exec" (ephemeral local execution queues)
        queues_db = (
            db.query(WorkerQueue)
            .filter(
                WorkerQueue.environment_id == environment_id,
                WorkerQueue.ephemeral == False,  # Exclude ephemeral queues
                ~WorkerQueue.name.startswith('local-exec')  # Exclude local-exec queues
            )
            .order_by(WorkerQueue.created_at.asc())
            .all()
        )

        if not queues_db:
            return []

        # Get active workers from Redis (with automatic TTL-based expiration)
        active_workers = await get_active_workers_from_redis(org_id, db=db)

        # Count workers per queue
        worker_counts = {}
        for worker_id, worker_data in active_workers.items():
            queue_id = worker_data.get("worker_queue_id")
            if queue_id:
                worker_counts[queue_id] = worker_counts.get(queue_id, 0) + 1

        # Build response
        queues = []
        for queue in queues_db:
            # Use queue UUID as task queue name for security
            task_queue_name = str(queue.id)
            active_worker_count = worker_counts.get(str(queue.id), 0)

            from sqlalchemy.inspection import inspect
            queue_dict = {c.key: getattr(queue, c.key) for c in inspect(queue).mapper.column_attrs}

            queues.append(
                WorkerQueueResponse(
                    **queue_dict,
                    active_workers=active_worker_count,
                    task_queue_name=task_queue_name,
                )
            )

        logger.info(
            "worker_queues_listed",
            count=len(queues),
            environment_id=environment_id,
            org_id=org_id,
        )

        return queues

    except HTTPException:
        raise
    except Exception as e:
        logger.error("worker_queues_list_failed", error=str(e), environment_id=environment_id)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to list worker queues: {str(e)}"
        )


@router.get("/worker-queues/{queue_id}", response_model=WorkerQueueResponse)
@instrument_endpoint("worker_queues.get_worker_queue")
async def get_worker_queue(
    queue_id: str,
    request: Request,
    organization: dict = Depends(get_current_organization),
    db: Session = Depends(get_db),
):
    """Get a specific worker queue by ID"""
    try:
        org_id = organization["id"]

        # Get worker queue with environment relationship
        queue = (
            db.query(WorkerQueue)
            .options(joinedload(WorkerQueue.environment))
            .filter(WorkerQueue.id == queue_id, WorkerQueue.organization_id == org_id)
            .first()
        )

        if not queue:
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail="Worker queue not found"
            )

        # Get environment name from relationship
        environment_name = queue.environment.name if queue.environment else "unknown"

        # Get active workers from Redis for this specific queue
        active_workers_dict = await get_active_workers_from_redis(org_id, queue_id, db=db)
        active_worker_count = len(active_workers_dict)

        # Convert to dict for Pydantic response
        from sqlalchemy.inspection import inspect
        queue_dict = {c.key: getattr(queue, c.key) for c in inspect(queue).mapper.column_attrs}

        # Use queue UUID as task queue name for security
        task_queue_name = queue_id

        return WorkerQueueResponse(
            **queue_dict,
            active_workers=active_worker_count,
            task_queue_name=task_queue_name,
        )

    except HTTPException:
        raise
    except Exception as e:
        logger.error("worker_queue_get_failed", error=str(e), queue_id=queue_id)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to get worker queue: {str(e)}"
        )

|
635
|
+
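# Hypothetical client-side call for the endpoint above. The /api/v1 prefix and
# bearer-token auth are assumptions (the router prefix is defined elsewhere):
#
#     import httpx
#
#     resp = httpx.get(
#         f"{control_plane_url}/api/v1/worker-queues/{queue_id}",
#         headers={"Authorization": f"Bearer {api_key}"},
#     )
#     resp.raise_for_status()
#     queue = resp.json()
#     print(queue["active_workers"], queue["task_queue_name"])
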
@router.patch("/worker-queues/{queue_id}", response_model=WorkerQueueResponse)
@instrument_endpoint("worker_queues.update_worker_queue")
async def update_worker_queue(
    queue_id: str,
    queue_data: WorkerQueueUpdate,
    request: Request,
    organization: dict = Depends(get_current_organization),
    db: Session = Depends(get_db),
):
    """Update a worker queue"""
    try:
        org_id = organization["id"]

        # Check if queue exists and get it with environment relationship
        queue = (
            db.query(WorkerQueue)
            .options(joinedload(WorkerQueue.environment))
            .filter(WorkerQueue.id == queue_id, WorkerQueue.organization_id == org_id)
            .first()
        )

        if not queue:
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail="Worker queue not found"
            )

        # Build update dict and apply updates
        update_data = queue_data.model_dump(exclude_unset=True)
        for key, value in update_data.items():
            setattr(queue, key, value)

        queue.updated_at = datetime.now(timezone.utc)

        db.commit()
        db.refresh(queue)

        # Get environment name from relationship
        environment_name = queue.environment.name if queue.environment else "unknown"

        # Get active workers from Redis for this specific queue
        active_workers_dict = await get_active_workers_from_redis(org_id, queue_id, db=db)
        active_worker_count = len(active_workers_dict)

        # Convert to dict for Pydantic response
        from sqlalchemy.inspection import inspect
        queue_dict = {c.key: getattr(queue, c.key) for c in inspect(queue).mapper.column_attrs}

        # Use queue UUID as task queue name for security
        task_queue_name = queue_id

        logger.info(
            "worker_queue_updated",
            queue_id=queue_id,
            org_id=org_id,
        )

        return WorkerQueueResponse(
            **queue_dict,
            active_workers=active_worker_count,
            task_queue_name=task_queue_name,
        )

    except HTTPException:
        raise
    except Exception as e:
        logger.error("worker_queue_update_failed", error=str(e), queue_id=queue_id)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to update worker queue: {str(e)}"
        )

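# The PATCH handler above relies on Pydantic's exclude_unset semantics: only
# fields the client actually sent are applied, so omitted fields keep their
# current values. A minimal sketch (the field name is illustrative):
#
#     payload = WorkerQueueUpdate(display_name="GPU workers")
#     payload.model_dump(exclude_unset=True)   # -> {"display_name": "GPU workers"}
#     # Fields left at their defaults are NOT included, so the setattr() loop
#     # never overwrites e.g. max_workers with None just because it was omitted.
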
@router.delete("/worker-queues/{queue_id}", status_code=status.HTTP_204_NO_CONTENT)
@instrument_endpoint("worker_queues.delete_worker_queue")
async def delete_worker_queue(
    queue_id: str,
    request: Request,
    organization: dict = Depends(get_current_organization),
    db: Session = Depends(get_db),
):
    """Delete a worker queue"""
    try:
        org_id = organization["id"]

        # Prevent deleting default queue and check if queue exists
        queue = (
            db.query(WorkerQueue)
            .filter(WorkerQueue.id == queue_id, WorkerQueue.organization_id == org_id)
            .first()
        )

        if not queue:
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail="Worker queue not found"
            )

        if queue.name == "default":
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail="Cannot delete the default worker queue"
            )

        # Check for active workers in Redis
        active_workers = await get_active_workers_from_redis(org_id, queue_id, db=db)

        if active_workers:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail=f"Cannot delete worker queue with {len(active_workers)} active workers"
            )

        # Delete queue
        db.delete(queue)
        db.commit()

        logger.info("worker_queue_deleted", queue_id=queue_id, org_id=org_id)

        return None

    except HTTPException:
        raise
    except Exception as e:
        logger.error("worker_queue_delete_failed", error=str(e), queue_id=queue_id)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to delete worker queue: {str(e)}"
        )

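# Hypothetical CLI usage of the delete endpoint above (URL prefix and auth
# header are assumptions):
#
#     curl -X DELETE \
#          -H "Authorization: Bearer $KUBIYA_API_KEY" \
#          "$CONTROL_PLANE_URL/api/v1/worker-queues/$QUEUE_ID"
#
# Expected outcomes: 204 on success; 400 if the queue is named "default" or
# still has workers reporting heartbeats in Redis; 404 if the queue is unknown.
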
@router.get("/worker-queues/{queue_id}/install-script")
|
|
767
|
+
@instrument_endpoint("worker_queues.get_installation_script")
|
|
768
|
+
async def get_installation_script(
|
|
769
|
+
queue_id: str,
|
|
770
|
+
deployment_type: Literal["docker", "kubernetes", "openshift", "local"] = "local",
|
|
771
|
+
request: Request = None,
|
|
772
|
+
organization: dict = Depends(get_current_organization),
|
|
773
|
+
db: Session = Depends(get_db),
|
|
774
|
+
):
|
|
775
|
+
"""
|
|
776
|
+
Generate an installation script for setting up a worker for this queue.
|
|
777
|
+
|
|
778
|
+
Supports multiple deployment types:
|
|
779
|
+
- local: Python virtual environment setup
|
|
780
|
+
- docker: Docker run command
|
|
781
|
+
- kubernetes: Kubernetes deployment YAML
|
|
782
|
+
- openshift: OpenShift deployment YAML
|
|
783
|
+
"""
|
|
784
|
+
try:
|
|
785
|
+
org_id = organization["id"]
|
|
786
|
+
|
|
787
|
+
# Get worker queue details with environment relationship
|
|
788
|
+
queue = (
|
|
789
|
+
db.query(WorkerQueue)
|
|
790
|
+
.options(joinedload(WorkerQueue.environment))
|
|
791
|
+
.filter(WorkerQueue.id == queue_id, WorkerQueue.organization_id == org_id)
|
|
792
|
+
.first()
|
|
793
|
+
)
|
|
794
|
+
|
|
795
|
+
if not queue:
|
|
796
|
+
raise HTTPException(
|
|
797
|
+
status_code=status.HTTP_404_NOT_FOUND,
|
|
798
|
+
detail="Worker queue not found"
|
|
799
|
+
)
|
|
800
|
+
|
|
801
|
+
# Get environment name from relationship
|
|
802
|
+
environment_name = "default"
|
|
803
|
+
if queue.environment:
|
|
804
|
+
environment_name = queue.environment.name
|
|
805
|
+
|
|
806
|
+
queue_name = queue.name
|
|
807
|
+
|
|
808
|
+
# Get control plane URL from the request that reached us
|
|
809
|
+
# This ensures installation scripts use the correct URL
|
|
810
|
+
control_plane_url = f"{request.url.scheme}://{request.url.netloc}"
|
|
811
|
+
|
|
812
|
+
# Generate new worker ID
|
|
813
|
+
worker_id = str(uuid.uuid4())
|
|
814
|
+
|
|
815
|
+
# Generate script based on deployment type
|
|
816
|
+
if deployment_type == "local":
|
|
817
|
+
script = _generate_local_script(worker_id, control_plane_url)
|
|
818
|
+
elif deployment_type == "docker":
|
|
819
|
+
script = _generate_docker_script(worker_id, control_plane_url, queue_name, environment_name)
|
|
820
|
+
elif deployment_type == "kubernetes":
|
|
821
|
+
script = _generate_kubernetes_script(worker_id, control_plane_url, queue_name, environment_name)
|
|
822
|
+
elif deployment_type == "openshift":
|
|
823
|
+
script = _generate_openshift_script(worker_id, control_plane_url, queue_name, environment_name)
|
|
824
|
+
else:
|
|
825
|
+
raise HTTPException(
|
|
826
|
+
status_code=status.HTTP_400_BAD_REQUEST,
|
|
827
|
+
detail=f"Unsupported deployment type: {deployment_type}"
|
|
828
|
+
)
|
|
829
|
+
|
|
830
|
+
logger.info(
|
|
831
|
+
"installation_script_generated",
|
|
832
|
+
queue_id=queue_id,
|
|
833
|
+
deployment_type=deployment_type,
|
|
834
|
+
worker_id=worker_id,
|
|
835
|
+
org_id=org_id,
|
|
836
|
+
)
|
|
837
|
+
|
|
838
|
+
return PlainTextResponse(content=script, media_type="text/plain")
|
|
839
|
+
|
|
840
|
+
except HTTPException:
|
|
841
|
+
raise
|
|
842
|
+
except Exception as e:
|
|
843
|
+
logger.error("installation_script_generation_failed", error=str(e), queue_id=queue_id)
|
|
844
|
+
raise HTTPException(
|
|
845
|
+
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
|
846
|
+
detail=f"Failed to generate installation script: {str(e)}"
|
|
847
|
+
)
|
|
848
|
+
|
|
849
|
+
|
|
850
|
+
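# Hypothetical one-liner for consuming the install-script endpoint above
# (prefix and auth assumed; deployment_type is a query parameter):
#
#     curl -H "Authorization: Bearer $KUBIYA_API_KEY" \
#          "$CONTROL_PLANE_URL/api/v1/worker-queues/$QUEUE_ID/install-script?deployment_type=docker"
#
# The response is plain text, so the "local" variant can be piped to bash
# (after reviewing it), while the "kubernetes"/"openshift" output is YAML to
# save and apply with kubectl or oc.
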
class WorkerSystemInfo(BaseModel):
    """Worker system information"""
    hostname: Optional[str] = None
    platform: Optional[str] = None
    os_name: Optional[str] = None
    os_version: Optional[str] = None
    python_version: Optional[str] = None
    cli_version: Optional[str] = None
    sdk_version: Optional[str] = None  # Worker SDK version
    pid: Optional[int] = None  # Process ID
    cwd: Optional[str] = None  # Current working directory
    supported_runtimes: Optional[List[str]] = None  # Available runtimes (e.g., ["agno", "claude_code"])
    llm_gateway_url: Optional[str] = None  # LiteLLM/LLM gateway URL
    docker_available: Optional[bool] = None
    docker_version: Optional[str] = None
    cpu_count: Optional[int] = None
    cpu_percent: Optional[float] = None
    memory_total: Optional[int] = None
    memory_used: Optional[int] = None
    memory_percent: Optional[float] = None
    disk_total: Optional[int] = None
    disk_used: Optional[int] = None
    disk_percent: Optional[float] = None
    uptime_seconds: Optional[float] = None


class WorkerStartRequest(BaseModel):
    """Worker start request with SDK version and system info"""
    worker_sdk_version: Optional[str] = None
    system_info: Optional[WorkerSystemInfo] = None
    control_plane_url: Optional[str] = None


class WorkerStartResponse(BaseModel):
    """Worker start configuration"""
    worker_id: str
    task_queue_name: str  # The queue UUID
    temporal_namespace: str
    temporal_host: str
    temporal_api_key: str
    organization_id: str
    control_plane_url: str
    heartbeat_interval: int
    # LiteLLM configuration for agno workflows/activities
    litellm_api_url: str
    litellm_api_key: str
    # Queue metadata
    queue_name: str
    environment_name: str
    queue_id: str  # Queue UUID for cleanup
    queue_ephemeral: bool = False  # Whether queue is ephemeral
    queue_single_execution: bool = False  # Whether queue is for single execution
    # Redis configuration for direct event streaming (default fast path)
    redis_url: Optional[str] = None
    redis_password: Optional[str] = None
    redis_enabled: bool = False
    # WebSocket configuration for per-worker persistent connections
    websocket_enabled: bool = True
    websocket_url: Optional[str] = None
    websocket_features: List[str] = Field(default_factory=lambda: ["events", "control", "heartbeat", "config_update"])
    # NATS configuration for high-performance event bus (optional)
    nats_config: Optional[Dict[str, str]] = None
    # SDK version for compatibility check
    control_plane_sdk_version: str

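# Abridged, illustrative shape of the /start response defined above, as a
# worker would receive it (every value is a fabricated placeholder):
#
#     {
#       "worker_id": "5f1c0e9a-...",
#       "task_queue_name": "<queue-uuid>",
#       "temporal_namespace": "default",
#       "temporal_host": "localhost:7233",
#       "temporal_api_key": "",
#       "organization_id": "org-123",
#       "control_plane_url": "https://cp.example.com",
#       "heartbeat_interval": 60,
#       "queue_name": "default",
#       "environment_name": "production",
#       "queue_id": "<queue-uuid>",
#       "redis_enabled": false,
#       "websocket_enabled": true,
#       "websocket_url": "wss://cp.example.com/api/v1/ws/workers/5f1c0e9a-...",
#       "websocket_features": ["events", "control", "heartbeat", "config_update"],
#       "nats_config": null,
#       "control_plane_sdk_version": "0.9.15"
#     }
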
@router.post("/worker-queues/{queue_id}/start", response_model=WorkerStartResponse)
|
|
917
|
+
@instrument_endpoint("worker_queues.start_worker_for_queue")
|
|
918
|
+
async def start_worker_for_queue(
|
|
919
|
+
queue_id: str,
|
|
920
|
+
request: Request,
|
|
921
|
+
body: WorkerStartRequest = WorkerStartRequest(),
|
|
922
|
+
organization: dict = Depends(get_current_organization),
|
|
923
|
+
db: Session = Depends(get_db),
|
|
924
|
+
):
|
|
925
|
+
"""
|
|
926
|
+
Start a worker for a specific queue.
|
|
927
|
+
|
|
928
|
+
This endpoint is called by the CLI with: kubiya worker start --queue-id={queue_id}
|
|
929
|
+
|
|
930
|
+
Returns all configuration needed for the worker to connect to Temporal.
|
|
931
|
+
"""
|
|
932
|
+
# Get control plane SDK version for compatibility check
|
|
933
|
+
from control_plane_api.version import get_sdk_version
|
|
934
|
+
control_plane_sdk_version = get_sdk_version()
|
|
935
|
+
|
|
936
|
+
# Log worker SDK version if provided
|
|
937
|
+
if body.worker_sdk_version:
|
|
938
|
+
logger.info(
|
|
939
|
+
"worker_registration_with_version",
|
|
940
|
+
queue_id=queue_id,
|
|
941
|
+
worker_sdk_version=body.worker_sdk_version,
|
|
942
|
+
control_plane_sdk_version=control_plane_sdk_version,
|
|
943
|
+
)
|
|
944
|
+
|
|
945
|
+
try:
|
|
946
|
+
org_id = organization["id"]
|
|
947
|
+
|
|
948
|
+
# Get worker queue with environment relationship
|
|
949
|
+
queue = (
|
|
950
|
+
db.query(WorkerQueue)
|
|
951
|
+
.options(joinedload(WorkerQueue.environment))
|
|
952
|
+
.filter(WorkerQueue.id == queue_id, WorkerQueue.organization_id == org_id)
|
|
953
|
+
.first()
|
|
954
|
+
)
|
|
955
|
+
|
|
956
|
+
if not queue:
|
|
957
|
+
# Check if queue exists at all (might be in different org)
|
|
958
|
+
queue_check = (
|
|
959
|
+
db.query(WorkerQueue)
|
|
960
|
+
.filter(WorkerQueue.id == queue_id)
|
|
961
|
+
.first()
|
|
962
|
+
)
|
|
963
|
+
|
|
964
|
+
if queue_check:
|
|
965
|
+
raise HTTPException(
|
|
966
|
+
status_code=status.HTTP_403_FORBIDDEN,
|
|
967
|
+
detail=f"Worker queue '{queue_id}' not found in your organization"
|
|
968
|
+
)
|
|
969
|
+
else:
|
|
970
|
+
raise HTTPException(
|
|
971
|
+
status_code=status.HTTP_404_NOT_FOUND,
|
|
972
|
+
detail=f"Worker queue '{queue_id}' does not exist. Please create a queue from the UI first."
|
|
973
|
+
)
|
|
974
|
+
|
|
975
|
+
# Check if environment is configured
|
|
976
|
+
if not queue.environment_id:
|
|
977
|
+
raise HTTPException(
|
|
978
|
+
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
|
979
|
+
detail=f"Worker queue '{queue.name}' has no environment configured. Please contact support."
|
|
980
|
+
)
|
|
981
|
+
|
|
982
|
+
if not queue.environment:
|
|
983
|
+
raise HTTPException(
|
|
984
|
+
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
|
985
|
+
detail=f"Environment configuration error for queue '{queue.name}'. Please contact support."
|
|
986
|
+
)
|
|
987
|
+
|
|
988
|
+
environment_name = queue.environment.name
|
|
989
|
+
|
|
990
|
+
# Check if queue is active
|
|
991
|
+
if queue.status != "active":
|
|
992
|
+
raise HTTPException(
|
|
993
|
+
status_code=status.HTTP_400_BAD_REQUEST,
|
|
994
|
+
detail=f"Worker queue is not active (status: {queue.status})"
|
|
995
|
+
)
|
|
996
|
+
|
|
997
|
+
# Get organization-specific Temporal credentials
|
|
998
|
+
import os
|
|
999
|
+
from control_plane_api.app.lib.temporal_credentials_service import (
|
|
1000
|
+
get_temporal_credentials_for_org,
|
|
1001
|
+
is_local_temporal
|
|
1002
|
+
)
|
|
1003
|
+
|
|
1004
|
+
org_id = organization["id"]
|
|
1005
|
+
token = request.state.kubiya_token
|
|
1006
|
+
|
|
1007
|
+
# Check if local Temporal (for development)
|
|
1008
|
+
if is_local_temporal():
|
|
1009
|
+
logger.info("using_local_temporal_config", queue_id=queue_id, org_id=org_id)
|
|
1010
|
+
temporal_credentials = {
|
|
1011
|
+
"namespace": os.getenv("TEMPORAL_NAMESPACE", "default"),
|
|
1012
|
+
"api_key": "",
|
|
1013
|
+
"host": os.getenv("TEMPORAL_HOST", "localhost:7233"),
|
|
1014
|
+
"org": org_id,
|
|
1015
|
+
}
|
|
1016
|
+
else:
|
|
1017
|
+
# Fetch org-specific credentials from Kubiya API
|
|
1018
|
+
# use_fallback=True for backwards compatibility during rollout
|
|
1019
|
+
try:
|
|
1020
|
+
temporal_credentials = await get_temporal_credentials_for_org(
|
|
1021
|
+
org_id=org_id,
|
|
1022
|
+
token=token,
|
|
1023
|
+
use_fallback=True # Enable fallback during migration
|
|
1024
|
+
)
|
|
1025
|
+
|
|
1026
|
+
logger.info(
|
|
1027
|
+
"temporal_credentials_fetched_for_worker",
|
|
1028
|
+
queue_id=queue_id,
|
|
1029
|
+
org_id=org_id,
|
|
1030
|
+
namespace=temporal_credentials["namespace"],
|
|
1031
|
+
source="kubiya_api"
|
|
1032
|
+
)
|
|
1033
|
+
except Exception as e:
|
|
1034
|
+
logger.error(
|
|
1035
|
+
"temporal_credentials_fetch_failed",
|
|
1036
|
+
queue_id=queue_id,
|
|
1037
|
+
org_id=org_id,
|
|
1038
|
+
error=str(e)
|
|
1039
|
+
)
|
|
1040
|
+
# If fallback is enabled, this won't raise; if disabled, it will
|
|
1041
|
+
raise HTTPException(
|
|
1042
|
+
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
|
1043
|
+
detail="Failed to fetch Temporal credentials. Please contact support."
|
|
1044
|
+
)
|
|
1045
|
+
|
|
1046
|
+
# For backwards compatibility with existing code
|
|
1047
|
+
namespace = {
|
|
1048
|
+
"namespace_name": temporal_credentials["namespace"],
|
|
1049
|
+
"api_key_encrypted": temporal_credentials["api_key"],
|
|
1050
|
+
}
|
|
1051
|
+
|
|
1052
|
+
# Generate worker ID
|
|
1053
|
+
worker_id = str(uuid.uuid4())
|
|
1054
|
+
|
|
1055
|
+
# Use worker's provided URL (preserves user configuration)
|
|
1056
|
+
# Fallback to request URL for backward compatibility with old workers
|
|
1057
|
+
if body.control_plane_url:
|
|
1058
|
+
control_plane_url = body.control_plane_url.rstrip("/")
|
|
1059
|
+
logger.info(
|
|
1060
|
+
"using_worker_provided_control_plane_url",
|
|
1061
|
+
queue_id=queue_id,
|
|
1062
|
+
worker_url=control_plane_url,
|
|
1063
|
+
request_url=f"{request.url.scheme}://{request.url.netloc}"
|
|
1064
|
+
)
|
|
1065
|
+
else:
|
|
1066
|
+
# Backward compatibility for old workers
|
|
1067
|
+
control_plane_url = f"{request.url.scheme}://{request.url.netloc}"
|
|
1068
|
+
logger.info(
|
|
1069
|
+
"using_request_derived_control_plane_url",
|
|
1070
|
+
queue_id=queue_id,
|
|
1071
|
+
control_plane_url=control_plane_url
|
|
1072
|
+
)
|
|
1073
|
+
temporal_host = temporal_credentials["host"]
|
|
1074
|
+
|
|
1075
|
+
# Get LiteLLM configuration for agno workflows/activities
|
|
1076
|
+
litellm_api_url = os.getenv("LITELLM_API_URL", "https://llm-proxy.kubiya.ai")
|
|
1077
|
+
litellm_api_key = os.getenv("LITELLM_API_KEY", "")
|
|
1078
|
+
|
|
1079
|
+
# Create worker heartbeat record
|
|
1080
|
+
|
|
1081
|
+
now = datetime.now(timezone.utc)
|
|
1082
|
+
worker_metadata = {}
|
|
1083
|
+
if body.system_info:
|
|
1084
|
+
worker_metadata = body.system_info.model_dump(exclude_none=True)
|
|
1085
|
+
logger.info(
|
|
1086
|
+
"worker_registration_with_system_info",
|
|
1087
|
+
worker_id=worker_id[:8],
|
|
1088
|
+
hostname=worker_metadata.get("hostname"),
|
|
1089
|
+
sdk_version=worker_metadata.get("sdk_version"),
|
|
1090
|
+
pid=worker_metadata.get("pid"),
|
|
1091
|
+
cwd=worker_metadata.get("cwd"),
|
|
1092
|
+
)
|
|
1093
|
+
|
|
1094
|
+
# Add LLM gateway URL from control plane config
|
|
1095
|
+
worker_metadata["llm_gateway_url"] = litellm_api_url
|
|
1096
|
+
|
|
1097
|
+
worker_heartbeat = WorkerHeartbeat(
|
|
1098
|
+
id=worker_id,
|
|
1099
|
+
worker_id=worker_id,
|
|
1100
|
+
organization_id=org_id,
|
|
1101
|
+
worker_queue_id=queue_id,
|
|
1102
|
+
environment_name=environment_name,
|
|
1103
|
+
status="active",
|
|
1104
|
+
tasks_processed=0,
|
|
1105
|
+
registered_at=now,
|
|
1106
|
+
last_heartbeat=now,
|
|
1107
|
+
updated_at=now,
|
|
1108
|
+
worker_metadata={},
|
|
1109
|
+
)
|
|
1110
|
+
|
|
1111
|
+
db.add(worker_heartbeat)
|
|
1112
|
+
db.commit()
|
|
1113
|
+
|
|
1114
|
+
# Task queue name is just the queue UUID for security
|
|
1115
|
+
task_queue_name = queue_id
|
|
1116
|
+
|
|
1117
|
+
# Determine WebSocket configuration
|
|
1118
|
+
# WebSocket is only supported when control plane is NOT in serverless environment
|
|
1119
|
+
# (Vercel, AWS Lambda, etc. don't support persistent WebSocket connections)
|
|
1120
|
+
control_plane_env = detect_environment()
|
|
1121
|
+
websocket_enabled = (
|
|
1122
|
+
os.getenv("WEBSOCKET_ENABLED", "true").lower() == "true"
|
|
1123
|
+
and control_plane_env == "standard"
|
|
1124
|
+
)
|
|
1125
|
+
websocket_url = None
|
|
1126
|
+
|
|
1127
|
+
if websocket_enabled:
|
|
1128
|
+
# Convert HTTP(S) to WS(S) for WebSocket URL
|
|
1129
|
+
ws_base = control_plane_url.replace("https://", "wss://").replace("http://", "ws://")
|
|
1130
|
+
websocket_url = f"{ws_base}/api/v1/ws/workers/{worker_id}"
|
|
1131
|
+
|
|
1132
|
+
if not websocket_enabled and control_plane_env == "serverless":
|
|
1133
|
+
logger.info(
|
|
1134
|
+
"websocket_disabled_serverless_control_plane",
|
|
1135
|
+
worker_id=worker_id[:8],
|
|
1136
|
+
environment=control_plane_env
|
|
1137
|
+
)
|
|
1138
|
+
|
|
1139
|
+
# Redis configuration for direct event streaming (default fast path)
|
|
1140
|
+
# Workers will use Redis directly instead of HTTP endpoint for better performance
|
|
1141
|
+
redis_url = None
|
|
1142
|
+
redis_password = None
|
|
1143
|
+
redis_enabled = False
|
|
1144
|
+
|
|
1145
|
+
if settings.redis_url:
|
|
1146
|
+
redis_url = settings.redis_url
|
|
1147
|
+
redis_enabled = True
|
|
1148
|
+
|
|
1149
|
+
# Extract password from Redis URL if present (redis://:password@host:port/db)
|
|
1150
|
+
if "@" in redis_url and ":" in redis_url:
|
|
1151
|
+
try:
|
|
1152
|
+
# Parse URL to extract password
|
|
1153
|
+
from urllib.parse import urlparse
|
|
1154
|
+
parsed = urlparse(redis_url)
|
|
1155
|
+
if parsed.password:
|
|
1156
|
+
redis_password = parsed.password
|
|
1157
|
+
except Exception as e:
|
|
1158
|
+
logger.warning(
|
|
1159
|
+
"redis_password_extraction_failed",
|
|
1160
|
+
error=str(e),
|
|
1161
|
+
worker_id=worker_id[:8],
|
|
1162
|
+
)
|
|
1163
|
+
|
|
1164
|
+
logger.info(
|
|
1165
|
+
"redis_config_provided_to_worker",
|
|
1166
|
+
worker_id=worker_id[:8],
|
|
1167
|
+
redis_url=redis_url.split("@")[-1] if "@" in redis_url else redis_url, # Log without password
|
|
1168
|
+
)
|
|
1169
|
+
|
|
1170
|
+
# NATS configuration (optional, enterprise-grade event bus)
|
|
1171
|
+
nats_config = None
|
|
1172
|
+
if (
|
|
1173
|
+
hasattr(settings, "event_bus")
|
|
1174
|
+
and settings.event_bus
|
|
1175
|
+
and isinstance(settings.event_bus, dict)
|
|
1176
|
+
and "nats" in settings.event_bus
|
|
1177
|
+
and settings.event_bus["nats"].get("enabled", False)
|
|
1178
|
+
):
|
|
1179
|
+
try:
|
|
1180
|
+
from control_plane_api.app.lib.nats import NATSCredentialsManager
|
|
1181
|
+
|
|
1182
|
+
# Get NATS operator credentials from settings/env
|
|
1183
|
+
nats_operator_jwt = os.getenv("NATS_OPERATOR_JWT")
|
|
1184
|
+
nats_operator_seed = os.getenv("NATS_OPERATOR_SEED")
|
|
1185
|
+
|
|
1186
|
+
if nats_operator_jwt and nats_operator_seed:
|
|
1187
|
+
# Create credentials manager
|
|
1188
|
+
creds_manager = NATSCredentialsManager(
|
|
1189
|
+
operator_jwt=nats_operator_jwt,
|
|
1190
|
+
operator_seed=nats_operator_seed,
|
|
1191
|
+
)
|
|
1192
|
+
|
|
1193
|
+
# Generate temporary worker credentials (24-hour TTL)
|
|
1194
|
+
worker_creds = creds_manager.create_worker_credentials(
|
|
1195
|
+
worker_id=worker_id,
|
|
1196
|
+
organization_id=org_id,
|
|
1197
|
+
ttl_hours=24,
|
|
1198
|
+
)
|
|
1199
|
+
|
|
1200
|
+
# Get NATS URL from config
|
|
1201
|
+
nats_url = settings.event_bus["nats"].get("nats_url")
|
|
1202
|
+
|
|
1203
|
+
# Build NATS config for worker
|
|
1204
|
+
nats_config = {
|
|
1205
|
+
"nats_url": nats_url,
|
|
1206
|
+
"nats_jwt": worker_creds.jwt,
|
|
1207
|
+
"nats_seed": worker_creds.seed,
|
|
1208
|
+
"subject_prefix": worker_creds.subject_prefix,
|
|
1209
|
+
"organization_id": org_id,
|
|
1210
|
+
"worker_id": worker_id,
|
|
1211
|
+
"jetstream_enabled": str(
|
|
1212
|
+
settings.event_bus["nats"].get("jetstream_enabled", True)
|
|
1213
|
+
),
|
|
1214
|
+
"expires_at": worker_creds.expires_at.isoformat(),
|
|
1215
|
+
}
|
|
1216
|
+
|
|
1217
|
+
logger.info(
|
|
1218
|
+
"nats_credentials_generated_for_worker",
|
|
1219
|
+
worker_id=worker_id[:8],
|
|
1220
|
+
organization_id=org_id,
|
|
1221
|
+
subject_prefix=worker_creds.subject_prefix,
|
|
1222
|
+
expires_at=worker_creds.expires_at.isoformat(),
|
|
1223
|
+
)
|
|
1224
|
+
else:
|
|
1225
|
+
logger.warning(
|
|
1226
|
+
"nats_operator_credentials_not_configured",
|
|
1227
|
+
message="NATS enabled but NATS_OPERATOR_JWT or NATS_OPERATOR_SEED not set",
|
|
1228
|
+
)
|
|
1229
|
+
|
|
1230
|
+
except ImportError:
|
|
1231
|
+
logger.warning(
|
|
1232
|
+
"nats_dependency_missing",
|
|
1233
|
+
message="NATS credentials generation skipped - nkeys not installed",
|
|
1234
|
+
)
|
|
1235
|
+
except Exception as e:
|
|
1236
|
+
logger.error(
|
|
1237
|
+
"nats_credentials_generation_failed",
|
|
1238
|
+
error=str(e),
|
|
1239
|
+
worker_id=worker_id[:8],
|
|
1240
|
+
)
|
|
1241
|
+
|
|
1242
|
+
logger.info(
|
|
1243
|
+
"worker_started_for_queue",
|
|
1244
|
+
worker_id=worker_id,
|
|
1245
|
+
queue_id=queue_id,
|
|
1246
|
+
task_queue_name=task_queue_name,
|
|
1247
|
+
org_id=org_id,
|
|
1248
|
+
websocket_enabled=websocket_enabled,
|
|
1249
|
+
nats_enabled=nats_config is not None,
|
|
1250
|
+
)
|
|
1251
|
+
|
|
1252
|
+
return WorkerStartResponse(
|
|
1253
|
+
worker_id=worker_id,
|
|
1254
|
+
task_queue_name=task_queue_name,
|
|
1255
|
+
temporal_namespace=namespace["namespace_name"],
|
|
1256
|
+
temporal_host=temporal_host,
|
|
1257
|
+
temporal_api_key=namespace["api_key_encrypted"],
|
|
1258
|
+
organization_id=org_id,
|
|
1259
|
+
control_plane_url=control_plane_url,
|
|
1260
|
+
heartbeat_interval=queue.heartbeat_interval or 60,
|
|
1261
|
+
litellm_api_url=litellm_api_url,
|
|
1262
|
+
litellm_api_key=litellm_api_key,
|
|
1263
|
+
queue_name=queue.name,
|
|
1264
|
+
environment_name=environment_name,
|
|
1265
|
+
queue_id=queue_id,
|
|
1266
|
+
queue_ephemeral=queue.ephemeral or False,
|
|
1267
|
+
queue_single_execution=queue.single_execution_mode or False,
|
|
1268
|
+
redis_url=redis_url,
|
|
1269
|
+
redis_password=redis_password,
|
|
1270
|
+
redis_enabled=redis_enabled,
|
|
1271
|
+
websocket_enabled=websocket_enabled,
|
|
1272
|
+
websocket_url=websocket_url,
|
|
1273
|
+
websocket_features=["events", "control", "heartbeat", "config_update"],
|
|
1274
|
+
nats_config=nats_config,
|
|
1275
|
+
control_plane_sdk_version=control_plane_sdk_version,
|
|
1276
|
+
)
|
|
1277
|
+
|
|
1278
|
+
except HTTPException:
|
|
1279
|
+
raise
|
|
1280
|
+
except Exception as e:
|
|
1281
|
+
logger.error(
|
|
1282
|
+
"worker_start_for_queue_failed",
|
|
1283
|
+
error=str(e),
|
|
1284
|
+
error_type=type(e).__name__,
|
|
1285
|
+
queue_id=queue_id,
|
|
1286
|
+
org_id=organization.get("id")
|
|
1287
|
+
)
|
|
1288
|
+
raise HTTPException(
|
|
1289
|
+
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
|
1290
|
+
detail=f"Failed to start worker due to an internal error. Please try again or contact support. (Error ID: {queue_id[:8]})"
|
|
1291
|
+
)
|
|
1292
|
+
|
|
1293
|
+
|
|
1294
|
+
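# Standalone sketch of two URL manipulations performed in the handler above
# (function names are illustrative, not part of this module's API):
from urllib.parse import urlparse as _urlparse


def _example_redis_password(redis_url: str):
    """Extract the password component, if any, from a redis:// URL.

    _urlparse("redis://:s3cret@cache:6379/0").password == "s3cret", while a
    password-less URL such as "redis://localhost:6379/0" yields None.
    """
    return _urlparse(redis_url).password


def _example_ws_url(control_plane_url: str, worker_id: str) -> str:
    """Derive the per-worker WebSocket URL from the HTTP(S) control plane URL."""
    ws_base = control_plane_url.replace("https://", "wss://").replace("http://", "ws://")
    return f"{ws_base}/api/v1/ws/workers/{worker_id}"
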
def _generate_local_script(worker_id: str, control_plane_url: str) -> str:
    """Generate a bash script for local Python installation"""
    return f"""#!/bin/bash
# Kubiya Agent Worker - Local Installation Script
# Generated: {datetime.now(timezone.utc).isoformat()}

set -e

echo "🚀 Setting up Kubiya Agent Worker..."
echo ""

# Configuration
WORKER_ID="{worker_id}"
CONTROL_PLANE_URL="{control_plane_url}"

# Check if KUBIYA_API_KEY is set
if [ -z "$KUBIYA_API_KEY" ]; then
    echo "❌ Error: KUBIYA_API_KEY environment variable is not set"
    echo "Please set it with: export KUBIYA_API_KEY=your-api-key"
    exit 1
fi

# Check Python version
if ! command -v python3 &> /dev/null; then
    echo "❌ Error: Python 3 is not installed"
    exit 1
fi

PYTHON_VERSION=$(python3 --version | cut -d' ' -f2 | cut -d'.' -f1,2)
echo "✓ Found Python $PYTHON_VERSION"

# Create directory
WORKER_DIR="$HOME/.kubiya/workers/$WORKER_ID"
mkdir -p "$WORKER_DIR"
cd "$WORKER_DIR"

echo "✓ Created worker directory: $WORKER_DIR"

# Create virtual environment
echo "📦 Creating virtual environment..."
python3 -m venv venv
source venv/bin/activate

# Install worker package (includes all dependencies from pyproject.toml)
echo "📦 Installing worker package..."
if command -v uv &> /dev/null; then
    echo "✓ Using uv (fast mode)"
    uv pip install --quiet kubiya-control-plane-api[worker]
else
    echo "ℹ️  Using pip (consider installing uv: https://github.com/astral-sh/uv)"
    pip install --quiet --upgrade pip
    pip install --quiet kubiya-control-plane-api[worker]
fi

echo "✓ Worker package installed"

# Create systemd service file (optional)
cat > kubiya-worker.service <<EOF
[Unit]
Description=Kubiya Agent Worker
After=network.target

[Service]
Type=simple
User=$USER
WorkingDirectory=$WORKER_DIR
Environment="WORKER_ID=$WORKER_ID"
Environment="KUBIYA_API_KEY=$KUBIYA_API_KEY"
Environment="CONTROL_PLANE_URL=$CONTROL_PLANE_URL"
ExecStart=$WORKER_DIR/venv/bin/python $WORKER_DIR/worker.py
Restart=always
RestartSec=10

[Install]
WantedBy=multi-user.target
EOF

echo "✓ Systemd service file created (optional)"

# Create run script
cat > run.sh <<EOF
#!/bin/bash
cd "$WORKER_DIR"
source venv/bin/activate
export WORKER_ID="$WORKER_ID"
export KUBIYA_API_KEY="$KUBIYA_API_KEY"
export CONTROL_PLANE_URL="$CONTROL_PLANE_URL"
python worker.py
EOF

chmod +x run.sh

echo ""
echo "✅ Installation complete!"
echo ""
echo "To start the worker:"
echo "  cd $WORKER_DIR && ./run.sh"
echo ""
echo "Or to install as a systemd service:"
echo "  sudo cp $WORKER_DIR/kubiya-worker.service /etc/systemd/system/"
echo "  sudo systemctl daemon-reload"
echo "  sudo systemctl enable kubiya-worker"
echo "  sudo systemctl start kubiya-worker"
echo ""
"""

def _generate_docker_script(worker_id: str, control_plane_url: str, queue_name: str, environment_name: str) -> str:
    """Generate Docker commands for running the worker"""
    return f"""# Kubiya Agent Worker - Docker Installation
# Generated: {datetime.now(timezone.utc).isoformat()}

# Configuration
WORKER_ID="{worker_id}"
CONTROL_PLANE_URL="{control_plane_url}"
QUEUE_NAME="{queue_name}"
ENVIRONMENT_NAME="{environment_name}"

# Make sure to set your API key
# export KUBIYA_API_KEY=your-api-key

# Run with Docker
docker run -d \\
  --name kubiya-worker-{queue_name}-{worker_id[:8]} \\
  --restart unless-stopped \\
  -e WORKER_ID="$WORKER_ID" \\
  -e KUBIYA_API_KEY="$KUBIYA_API_KEY" \\
  -e CONTROL_PLANE_URL="$CONTROL_PLANE_URL" \\
  -e LOG_LEVEL="INFO" \\
  kubiya/agent-worker:latest

# Check logs
# docker logs -f kubiya-worker-{queue_name}-{worker_id[:8]}

# Stop worker
# docker stop kubiya-worker-{queue_name}-{worker_id[:8]}

# Remove worker
# docker rm kubiya-worker-{queue_name}-{worker_id[:8]}

# Docker Compose (save as docker-compose.yml)
cat > docker-compose.yml <<EOF
version: '3.8'

services:
  worker:
    image: kubiya/agent-worker:latest
    container_name: kubiya-worker-{queue_name}
    restart: unless-stopped
    environment:
      - WORKER_ID={worker_id}
      - KUBIYA_API_KEY=${{KUBIYA_API_KEY}}
      - CONTROL_PLANE_URL={control_plane_url}
      - LOG_LEVEL=INFO
    healthcheck:
      test: ["CMD", "python", "-c", "import httpx; httpx.get('{control_plane_url}/health')"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 10s
EOF

# To use docker-compose:
# docker-compose up -d
"""

def _generate_kubernetes_script(worker_id: str, control_plane_url: str, queue_name: str, environment_name: str) -> str:
    """Generate Kubernetes deployment YAML"""
    return f"""# Kubiya Agent Worker - Kubernetes Deployment
# Generated: {datetime.now(timezone.utc).isoformat()}
#
# To deploy:
# 1. Create secret: kubectl create secret generic kubiya-worker-secret --from-literal=api-key=YOUR_API_KEY
# 2. Apply this file: kubectl apply -f kubiya-worker.yaml
#
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: kubiya-worker-{queue_name}-config
  labels:
    app: kubiya-worker
    queue: {queue_name}
    environment: {environment_name}
data:
  WORKER_ID: "{worker_id}"
  CONTROL_PLANE_URL: "{control_plane_url}"
  LOG_LEVEL: "INFO"

---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: kubiya-worker-{queue_name}
  labels:
    app: kubiya-worker
    queue: {queue_name}
    environment: {environment_name}
spec:
  replicas: 1
  selector:
    matchLabels:
      app: kubiya-worker
      queue: {queue_name}
  template:
    metadata:
      labels:
        app: kubiya-worker
        queue: {queue_name}
        environment: {environment_name}
    spec:
      containers:
        - name: worker
          image: kubiya/agent-worker:latest
          imagePullPolicy: Always
          envFrom:
            - configMapRef:
                name: kubiya-worker-{queue_name}-config
          env:
            - name: KUBIYA_API_KEY
              valueFrom:
                secretKeyRef:
                  name: kubiya-worker-secret
                  key: api-key
          resources:
            requests:
              memory: "512Mi"
              cpu: "250m"
            limits:
              memory: "2Gi"
              cpu: "1000m"
          livenessProbe:
            httpGet:
              path: /health
              port: 8080
            initialDelaySeconds: 30
            periodSeconds: 30
            timeoutSeconds: 10
            failureThreshold: 3
          readinessProbe:
            httpGet:
              path: /health
              port: 8080
            initialDelaySeconds: 10
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 3
      restartPolicy: Always

---
apiVersion: v1
kind: Service
metadata:
  name: kubiya-worker-{queue_name}
  labels:
    app: kubiya-worker
    queue: {queue_name}
spec:
  selector:
    app: kubiya-worker
    queue: {queue_name}
  ports:
    - protocol: TCP
      port: 8080
      targetPort: 8080
  type: ClusterIP

---
# Optional: HorizontalPodAutoscaler
# apiVersion: autoscaling/v2
# kind: HorizontalPodAutoscaler
# metadata:
#   name: kubiya-worker-{queue_name}
# spec:
#   scaleTargetRef:
#     apiVersion: apps/v1
#     kind: Deployment
#     name: kubiya-worker-{queue_name}
#   minReplicas: 1
#   maxReplicas: 10
#   metrics:
#     - type: Resource
#       resource:
#         name: cpu
#         target:
#           type: Utilization
#           averageUtilization: 70
"""

class WorkerQueueCommandResponse(BaseModel):
    """Worker queue connection command"""
    queue_id: str
    command: str
    command_parts: dict
    can_register: bool
    queue_status: str
    active_workers: int
    max_workers: Optional[int]


class WorkerDetail(BaseModel):
    """Individual worker details"""
    id: str
    worker_id: str
    status: str
    tasks_processed: int
    current_task_id: Optional[str]
    last_heartbeat: str
    registered_at: str
    system_info: Optional[WorkerSystemInfo] = None
    logs: Optional[List[str]] = None
    worker_metadata: dict

@router.get("/worker-queues/{queue_id}/workers", response_model=List[WorkerDetail])
|
|
1611
|
+
@instrument_endpoint("worker_queues.list_queue_workers")
|
|
1612
|
+
async def list_queue_workers(
|
|
1613
|
+
queue_id: str,
|
|
1614
|
+
request: Request,
|
|
1615
|
+
organization: dict = Depends(get_current_organization),
|
|
1616
|
+
db: Session = Depends(get_db),
|
|
1617
|
+
):
|
|
1618
|
+
"""
|
|
1619
|
+
List all workers for a specific queue with detailed information.
|
|
1620
|
+
"""
|
|
1621
|
+
try:
|
|
1622
|
+
org_id = organization["id"]
|
|
1623
|
+
|
|
1624
|
+
# Get active workers from Redis for this queue
|
|
1625
|
+
active_workers = await get_active_workers_from_redis(org_id, queue_id, db=db)
|
|
1626
|
+
|
|
1627
|
+
# Get worker registration details from database (registered_at, worker_id, worker_metadata)
|
|
1628
|
+
if active_workers:
|
|
1629
|
+
db_workers = (
|
|
1630
|
+
db.query(WorkerHeartbeat)
|
|
1631
|
+
.filter(
|
|
1632
|
+
WorkerHeartbeat.organization_id == org_id,
|
|
1633
|
+
WorkerHeartbeat.id.in_(list(active_workers.keys()))
|
|
1634
|
+
)
|
|
1635
|
+
.all()
|
|
1636
|
+
)
|
|
1637
|
+
db_workers_map = {str(w.id): w for w in db_workers}
|
|
1638
|
+
else:
|
|
1639
|
+
db_workers_map = {}
|
|
1640
|
+
|
|
1641
|
+
workers = []
|
|
1642
|
+
for worker_id, heartbeat_data in active_workers.items():
|
|
1643
|
+
# Get DB data for registration time
|
|
1644
|
+
db_data = db_workers_map.get(worker_id, None)
|
|
1645
|
+
|
|
1646
|
+
# Extract system info and logs from Redis heartbeat data
|
|
1647
|
+
metadata = heartbeat_data.get("metadata", {})
|
|
1648
|
+
system_info_data = heartbeat_data.get("system_info")
|
|
1649
|
+
logs = heartbeat_data.get("logs", [])
|
|
1650
|
+
|
|
1651
|
+
# Fall back to worker_metadata from database if system_info not in Redis
|
|
1652
|
+
if not system_info_data and db_data and db_data.worker_metadata:
|
|
1653
|
+
system_info_data = db_data.worker_metadata
|
|
1654
|
+
|
|
1655
|
+
system_info = WorkerSystemInfo(**system_info_data) if system_info_data else None
|
|
1656
|
+
|
|
1657
|
+
workers.append(
|
|
1658
|
+
WorkerDetail(
|
|
1659
|
+
id=worker_id,
|
|
1660
|
+
worker_id=db_data.worker_id if db_data else worker_id,
|
|
1661
|
+
status=heartbeat_data.get("status", "unknown"),
|
|
1662
|
+
tasks_processed=heartbeat_data.get("tasks_processed", 0),
|
|
1663
|
+
current_task_id=heartbeat_data.get("current_task_id"),
|
|
1664
|
+
last_heartbeat=heartbeat_data.get("last_heartbeat", ""),
|
|
1665
|
+
registered_at=db_data.registered_at.isoformat() if db_data and db_data.registered_at else "",
|
|
1666
|
+
system_info=system_info,
|
|
1667
|
+
logs=logs,
|
|
1668
|
+
worker_metadata=metadata,
|
|
1669
|
+
)
|
|
1670
|
+
)
|
|
1671
|
+
|
|
1672
|
+
# Sort by last_heartbeat desc
|
|
1673
|
+
workers.sort(key=lambda w: w.last_heartbeat, reverse=True)
|
|
1674
|
+
|
|
1675
|
+
logger.info(
|
|
1676
|
+
"queue_workers_listed",
|
|
1677
|
+
queue_id=queue_id,
|
|
1678
|
+
worker_count=len(workers),
|
|
1679
|
+
org_id=org_id,
|
|
1680
|
+
)
|
|
1681
|
+
|
|
1682
|
+
return workers
|
|
1683
|
+
|
|
1684
|
+
except HTTPException:
|
|
1685
|
+
raise
|
|
1686
|
+
except Exception as e:
|
|
1687
|
+
logger.error("queue_workers_list_failed", error=str(e), queue_id=queue_id)
|
|
1688
|
+
raise HTTPException(
|
|
1689
|
+
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
|
1690
|
+
detail=f"Failed to list queue workers: {str(e)}"
|
|
1691
|
+
)
|
|
1692
|
+
|
|
1693
|
+
|
|
1694
|
+
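# Note on the sort above: last_heartbeat travels as an ISO-8601 string, and
# fixed-width, same-offset ISO-8601 timestamps order correctly under plain
# lexicographic comparison, so no datetime parsing is needed (this assumes all
# heartbeats report the same UTC offset). A quick illustration:
#
#     "2025-01-02T00:00:00+00:00" > "2025-01-01T23:59:59+00:00"   # True
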
@router.get("/worker-queues/{queue_id}/metrics", response_model=WorkerQueueMetricsResponse)
|
|
1695
|
+
@instrument_endpoint("worker_queues.get_worker_queue_metrics")
|
|
1696
|
+
async def get_worker_queue_metrics(
|
|
1697
|
+
queue_id: str,
|
|
1698
|
+
request: Request,
|
|
1699
|
+
organization: dict = Depends(get_current_organization),
|
|
1700
|
+
db: Session = Depends(get_db),
|
|
1701
|
+
):
|
|
1702
|
+
"""
|
|
1703
|
+
Get comprehensive metrics for a worker queue.
|
|
1704
|
+
|
|
1705
|
+
Returns worker health metrics, task statistics, and performance data.
|
|
1706
|
+
"""
|
|
1707
|
+
try:
|
|
1708
|
+
org_id = organization["id"]
|
|
1709
|
+
|
|
1710
|
+
# Use service layer for business logic
|
|
1711
|
+
metrics_service = WorkerQueueMetricsService(db)
|
|
1712
|
+
metrics = await metrics_service.get_queue_metrics(queue_id, org_id)
|
|
1713
|
+
|
|
1714
|
+
logger.info(
|
|
1715
|
+
"queue_metrics_retrieved",
|
|
1716
|
+
queue_id=queue_id,
|
|
1717
|
+
org_id=org_id
|
|
1718
|
+
)
|
|
1719
|
+
|
|
1720
|
+
return metrics
|
|
1721
|
+
|
|
1722
|
+
except ValueError as e:
|
|
1723
|
+
raise HTTPException(
|
|
1724
|
+
status_code=status.HTTP_404_NOT_FOUND,
|
|
1725
|
+
detail=str(e)
|
|
1726
|
+
)
|
|
1727
|
+
except HTTPException:
|
|
1728
|
+
raise
|
|
1729
|
+
except Exception as e:
|
|
1730
|
+
logger.error(
|
|
1731
|
+
"queue_metrics_failed",
|
|
1732
|
+
error=str(e),
|
|
1733
|
+
queue_id=queue_id,
|
|
1734
|
+
org_id=org_id
|
|
1735
|
+
)
|
|
1736
|
+
raise HTTPException(
|
|
1737
|
+
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
|
1738
|
+
detail=f"Failed to get queue metrics: {str(e)}"
|
|
1739
|
+
)
|
|
1740
|
+
|
|
1741
|
+
|
|
1742
|
+
@router.get("/worker-queues/{queue_id}/workflows", response_model=WorkflowsListResponse)
|
|
1743
|
+
@instrument_endpoint("worker_queues.list_queue_workflows")
|
|
1744
|
+
async def list_queue_workflows(
|
|
1745
|
+
queue_id: str,
|
|
1746
|
+
request: Request,
|
|
1747
|
+
status_filter: Optional[str] = None,
|
|
1748
|
+
limit: int = 100,
|
|
1749
|
+
organization: dict = Depends(get_current_organization),
|
|
1750
|
+
db: Session = Depends(get_db),
|
|
1751
|
+
):
|
|
1752
|
+
"""
|
|
1753
|
+
List workflows/tasks for a worker queue.
|
|
1754
|
+
|
|
1755
|
+
Returns list of workflows with status counts and filtering options.
|
|
1756
|
+
"""
|
|
1757
|
+
try:
|
|
1758
|
+
org_id = organization["id"]
|
|
1759
|
+
|
|
1760
|
+
# Import service here to avoid circular imports
|
|
1761
|
+
from control_plane_api.app.services.workflow_operations_service import WorkflowOperationsService
|
|
1762
|
+
|
|
1763
|
+
# Use service layer for business logic
|
|
1764
|
+
workflow_service = WorkflowOperationsService(db)
|
|
1765
|
+
workflows = await workflow_service.list_queue_workflows(
|
|
1766
|
+
queue_id=queue_id,
|
|
1767
|
+
organization_id=org_id,
|
|
1768
|
+
status_filter=status_filter,
|
|
1769
|
+
limit=limit
|
|
1770
|
+
)
|
|
1771
|
+
|
|
1772
|
+
logger.info(
|
|
1773
|
+
"queue_workflows_listed",
|
|
1774
|
+
queue_id=queue_id,
|
|
1775
|
+
total=workflows.total,
|
|
1776
|
+
org_id=org_id
|
|
1777
|
+
)
|
|
1778
|
+
|
|
1779
|
+
return workflows
|
|
1780
|
+
|
|
1781
|
+
except ValueError as e:
|
|
1782
|
+
raise HTTPException(
|
|
1783
|
+
status_code=status.HTTP_404_NOT_FOUND,
|
|
1784
|
+
detail=str(e)
|
|
1785
|
+
)
|
|
1786
|
+
except HTTPException:
|
|
1787
|
+
raise
|
|
1788
|
+
except Exception as e:
|
|
1789
|
+
logger.error(
|
|
1790
|
+
"queue_workflows_list_failed",
|
|
1791
|
+
error=str(e),
|
|
1792
|
+
queue_id=queue_id,
|
|
1793
|
+
org_id=org_id
|
|
1794
|
+
)
|
|
1795
|
+
raise HTTPException(
|
|
1796
|
+
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
|
1797
|
+
detail=f"Failed to list queue workflows: {str(e)}"
|
|
1798
|
+
)
|
|
1799
|
+
|
|
1800
|
+
|
|
1801
|
+
@router.get("/worker-queues/{queue_id}/worker-command", response_model=WorkerQueueCommandResponse)
|
|
1802
|
+
@instrument_endpoint("worker_queues.get_worker_queue_command")
|
|
1803
|
+
async def get_worker_queue_command(
|
|
1804
|
+
queue_id: str,
|
|
1805
|
+
request: Request,
|
|
1806
|
+
organization: dict = Depends(get_current_organization),
|
|
1807
|
+
db: Session = Depends(get_db),
|
|
1808
|
+
):
|
|
1809
|
+
"""
|
|
1810
|
+
Get the worker registration command for a specific worker queue.
|
|
1811
|
+
|
|
1812
|
+
Returns the kubiya worker start command with the queue ID that users
|
|
1813
|
+
should run to start a worker for this specific queue.
|
|
1814
|
+
"""
|
|
1815
|
+
try:
|
|
1816
|
+
org_id = organization["id"]
|
|
1817
|
+
|
|
1818
|
+
# Get worker queue
|
|
1819
|
+
queue = (
|
|
1820
|
+
db.query(WorkerQueue)
|
|
1821
|
+
.filter(WorkerQueue.id == queue_id, WorkerQueue.organization_id == org_id)
|
|
1822
|
+
.first()
|
|
1823
|
+
)
|
|
1824
|
+
|
|
1825
|
+
if not queue:
|
|
1826
|
+
raise HTTPException(status_code=404, detail="Worker queue not found")
|
|
1827
|
+
|
|
1828
|
+
queue_status = queue.status or "unknown"
|
|
1829
|
+
|
|
1830
|
+
# Check if queue is active
|
|
1831
|
+
can_register = queue_status == "active"
|
|
1832
|
+
|
|
1833
|
+
# Get active workers from Redis for this specific queue
|
|
1834
|
+
active_workers_dict = await get_active_workers_from_redis(org_id, queue_id, db=db)
|
|
1835
|
+
active_worker_count = len(active_workers_dict)
|
|
1836
|
+
|
|
1837
|
+
# Build command
|
|
1838
|
+
command = f"kubiya worker start --queue-id {queue_id}"
|
|
1839
|
+
|
|
1840
|
+
command_parts = {
|
|
1841
|
+
"binary": "kubiya",
|
|
1842
|
+
"subcommand": "worker start",
|
|
1843
|
+
"flags": {
|
|
1844
|
+
"--queue-id": queue_id,
|
|
1845
|
+
},
|
|
1846
|
+
}
|
|
1847
|
+
|
|
1848
|
+
logger.info(
|
|
1849
|
+
"worker_queue_command_retrieved",
|
|
1850
|
+
queue_id=queue_id,
|
|
1851
|
+
can_register=can_register,
|
|
1852
|
+
status=queue_status,
|
|
1853
|
+
active_workers=active_worker_count,
|
|
1854
|
+
org_id=org_id,
|
|
1855
|
+
)
|
|
1856
|
+
|
|
1857
|
+
return WorkerQueueCommandResponse(
|
|
1858
|
+
queue_id=queue_id,
|
|
1859
|
+
command=command,
|
|
1860
|
+
command_parts=command_parts,
|
|
1861
|
+
can_register=can_register,
|
|
1862
|
+
queue_status=queue_status,
|
|
1863
|
+
active_workers=active_worker_count,
|
|
1864
|
+
max_workers=queue.max_workers,
|
|
1865
|
+
)
|
|
1866
|
+
|
|
1867
|
+
except HTTPException:
|
|
1868
|
+
raise
|
|
1869
|
+
except Exception as e:
|
|
1870
|
+
logger.error("worker_queue_command_failed", error=str(e), queue_id=queue_id)
|
|
1871
|
+
raise HTTPException(
|
|
1872
|
+
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
|
1873
|
+
detail=f"Failed to get worker queue command: {str(e)}"
|
|
1874
|
+
)
|
|
1875
|
+
|
|
1876
|
+
|
|
1877
|
+
def _generate_openshift_script(worker_id: str, control_plane_url: str, queue_name: str, environment_name: str) -> str:
    """Generate OpenShift deployment YAML"""
    return f"""# Kubiya Agent Worker - OpenShift Deployment
# Generated: {datetime.now(timezone.utc).isoformat()}
#
# To deploy:
# 1. Create secret: oc create secret generic kubiya-worker-secret --from-literal=api-key=YOUR_API_KEY
# 2. Apply this file: oc apply -f kubiya-worker.yaml
#
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: kubiya-worker-{queue_name}-config
  labels:
    app: kubiya-worker
    queue: {queue_name}
    environment: {environment_name}
data:
  WORKER_ID: "{worker_id}"
  CONTROL_PLANE_URL: "{control_plane_url}"
  LOG_LEVEL: "INFO"

---
apiVersion: apps.openshift.io/v1
kind: DeploymentConfig
metadata:
  name: kubiya-worker-{queue_name}
  labels:
    app: kubiya-worker
    queue: {queue_name}
    environment: {environment_name}
spec:
  replicas: 1
  selector:
    app: kubiya-worker
    queue: {queue_name}
  template:
    metadata:
      labels:
        app: kubiya-worker
        queue: {queue_name}
        environment: {environment_name}
    spec:
      containers:
        - name: worker
          image: kubiya/agent-worker:latest
          imagePullPolicy: Always
          envFrom:
            - configMapRef:
                name: kubiya-worker-{queue_name}-config
          env:
            - name: KUBIYA_API_KEY
              valueFrom:
                secretKeyRef:
                  name: kubiya-worker-secret
                  key: api-key
          resources:
            requests:
              memory: "512Mi"
              cpu: "250m"
            limits:
              memory: "2Gi"
              cpu: "1000m"
          livenessProbe:
            httpGet:
              path: /health
              port: 8080
            initialDelaySeconds: 30
            periodSeconds: 30
            timeoutSeconds: 10
            failureThreshold: 3
          readinessProbe:
            httpGet:
              path: /health
              port: 8080
            initialDelaySeconds: 10
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 3
      restartPolicy: Always
      securityContext:
        runAsNonRoot: true
        runAsUser: 1000
  triggers:
    - type: ConfigChange
    - type: ImageChange
      imageChangeParams:
        automatic: true
        containerNames:
          - worker
        from:
          kind: ImageStreamTag
          name: agent-worker:latest

---
apiVersion: v1
kind: Service
metadata:
  name: kubiya-worker-{queue_name}
  labels:
    app: kubiya-worker
    queue: {queue_name}
spec:
  selector:
    app: kubiya-worker
    queue: {queue_name}
  ports:
    - protocol: TCP
      port: 8080
      targetPort: 8080
  type: ClusterIP

---
# Optional: Route to expose the service
# apiVersion: route.openshift.io/v1
# kind: Route
# metadata:
#   name: kubiya-worker-{queue_name}
#   labels:
#     app: kubiya-worker
#     queue: {queue_name}
# spec:
#   to:
#     kind: Service
#     name: kubiya-worker-{queue_name}
#   port:
#     targetPort: 8080
#   tls:
#     termination: edge
#     insecureEdgeTerminationPolicy: Redirect
"""

# ============================================================================
# Worker Auto-Update Endpoints
# ============================================================================


class WorkerQueueConfigResponse(BaseModel):
    """Worker queue configuration with version tracking for auto-updates"""
    queue_id: str
    name: str
    display_name: Optional[str]
    description: Optional[str]
    status: str
    max_workers: Optional[int]
    heartbeat_interval: int
    tags: List[str]
    settings: dict
    config_version: str  # SHA256 hash of configuration for change detection
    config_updated_at: str  # Timestamp of last configuration change
    recommended_package_version: Optional[str] = None  # Latest recommended worker package version
    environment_id: str
    environment_name: str


class UpdateLockRequest(BaseModel):
    """Request to acquire an update lock for coordinated rolling updates"""
    worker_id: str
    lock_duration_seconds: int = Field(default=300, ge=60, le=600, description="Lock TTL (60-600 seconds)")


class UpdateLockResponse(BaseModel):
    """Response with update lock information"""
    lock_id: str
    worker_id: str
    queue_id: str
    acquired_at: str
    expires_at: str
    locked: bool


def _compute_config_hash(queue: dict) -> str:
    """
    Compute SHA256 hash of worker queue configuration.

    This hash is used to detect configuration changes for auto-updates.
    Only includes fields that affect worker behavior.
    """
    config_data = {
        "name": queue.get("name"),
        "status": queue.get("status"),
        "max_workers": queue.get("max_workers"),
        "heartbeat_interval": queue.get("heartbeat_interval"),
        "tags": sorted(queue.get("tags", [])),  # Sort for consistency
        "settings": queue.get("settings", {}),
    }

    # Serialize to JSON with sorted keys for consistent hashing
    config_json = json.dumps(config_data, sort_keys=True)
    return hashlib.sha256(config_json.encode()).hexdigest()

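# Quick sketch of why _compute_config_hash is stable across equivalent inputs:
# json.dumps(..., sort_keys=True) canonicalizes key order and the tags list is
# sorted, so two dicts describing the same effective configuration hash
# identically regardless of how they were assembled:
#
#     a = {"name": "q", "tags": ["gpu", "ci"], "settings": {}}
#     b = {"tags": ["ci", "gpu"], "settings": {}, "name": "q"}
#     _compute_config_hash(a) == _compute_config_hash(b)   # True
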
@router.get("/worker-queues/{queue_id}/config", response_model=WorkerQueueConfigResponse)
|
|
2072
|
+
@instrument_endpoint("worker_queues.get_worker_queue_config")
|
|
2073
|
+
async def get_worker_queue_config(
|
|
2074
|
+
queue_id: str,
|
|
2075
|
+
request: Request,
|
|
2076
|
+
organization: dict = Depends(get_current_organization),
|
|
2077
|
+
db: Session = Depends(get_db),
|
|
2078
|
+
):
|
|
2079
|
+
"""
|
|
2080
|
+
Get worker queue configuration with version tracking for auto-updates.
|
|
2081
|
+
|
|
2082
|
+
This endpoint is called by CLI workers periodically to check for configuration changes.
|
|
2083
|
+
The config_version hash allows workers to detect when they need to reload.
|
|
2084
|
+
|
|
2085
|
+
Args:
|
|
2086
|
+
queue_id: Worker queue ID
|
|
2087
|
+
|
|
2088
|
+
Returns:
|
|
2089
|
+
Configuration with version hash and recommended package version
|
|
2090
|
+
"""
|
|
2091
|
+
try:
|
|
2092
|
+
org_id = organization["id"]
|
|
2093
|
+
|
|
2094
|
+
# Get worker queue with environment relationship
|
|
2095
|
+
queue = (
|
|
2096
|
+
db.query(WorkerQueue)
|
|
2097
|
+
.options(joinedload(WorkerQueue.environment))
|
|
2098
|
+
.filter(WorkerQueue.id == queue_id, WorkerQueue.organization_id == org_id)
|
|
2099
|
+
.first()
|
|
2100
|
+
)
|
|
2101
|
+
|
|
2102
|
+
if not queue:
|
|
2103
|
+
raise HTTPException(
|
|
2104
|
+
status_code=status.HTTP_404_NOT_FOUND,
|
|
2105
|
+
detail="Worker queue not found"
|
|
2106
|
+
)
|
|
2107
|
+
|
|
2108
|
+
# Get environment name from relationship
|
|
2109
|
+
environment_name = queue.environment.name if queue.environment else "unknown"
|
|
2110
|
+
|
|
2111
|
+
# Convert queue to dict for config hash computation
|
|
2112
|
+
from sqlalchemy.inspection import inspect
|
|
2113
|
+
queue_dict = {c.key: getattr(queue, c.key) for c in inspect(queue).mapper.column_attrs}
|
|
2114
|
+
|
|
2115
|
+
# Compute configuration hash for change detection
|
|
2116
|
+
config_version = _compute_config_hash(queue_dict)
|
|
2117
|
+
|
|
2118
|
+
# Get recommended package version from control plane settings or PyPI
|
|
2119
|
+
# This can be configured via environment variable or fetched from PyPI
|
|
2120
|
+
recommended_package_version = os.getenv("KUBIYA_RECOMMENDED_WORKER_VERSION")
|
|
2121
|
+
if not recommended_package_version:
|
|
2122
|
+
# Fetch latest version from PyPI (cached for performance)
|
|
2123
|
+
try:
|
|
2124
|
+
import httpx
|
|
2125
|
+
response = httpx.get("https://pypi.org/pypi/kubiya-control-plane-api/json", timeout=5.0)
|
|
2126
|
+
if response.status_code == 200:
|
|
2127
|
+
pypi_data = response.json()
|
|
2128
|
+
recommended_package_version = pypi_data.get("info", {}).get("version")
|
|
2129
|
+
except Exception as e:
|
|
2130
|
+
logger.warning(
|
|
2131
|
+
"failed_to_fetch_pypi_version",
|
|
2132
|
+
error=str(e),
|
|
2133
|
+
queue_id=queue_id,
|
|
2134
|
+
)
|
|
2135
|
+
# Fallback: no recommendation if PyPI fetch fails
|
|
2136
|
+
recommended_package_version = None
|
|
2137
|
+
|
|
2138
|
+
logger.info(
|
|
2139
|
+
"worker_queue_config_fetched",
|
|
2140
|
+
queue_id=queue_id,
|
|
2141
|
+
config_version=config_version[:8], # Log first 8 chars of hash
|
|
2142
|
+
org_id=org_id,
|
|
2143
|
+
)
|
|
2144
|
+
|
|
2145
|
+
return WorkerQueueConfigResponse(
|
|
2146
|
+
queue_id=queue_id,
|
|
2147
|
+
name=queue.name,
|
|
2148
|
+
display_name=queue.display_name,
|
|
2149
|
+
description=queue.description,
|
|
2150
|
+
status=queue.status,
|
|
2151
|
+
max_workers=queue.max_workers,
|
|
2152
|
+
heartbeat_interval=queue.heartbeat_interval or 60,
|
|
2153
|
+
tags=queue.tags or [],
|
|
2154
|
+
settings=queue.settings or {},
|
|
2155
|
+
config_version=config_version,
|
|
2156
|
+
config_updated_at=queue.updated_at.isoformat() if queue.updated_at else queue.created_at.isoformat(),
|
|
2157
|
+
recommended_package_version=recommended_package_version,
|
|
2158
|
+
environment_id=str(queue.environment_id),
|
|
2159
|
+
environment_name=environment_name,
|
|
2160
|
+
)
|
|
2161
|
+
|
|
2162
|
+
except HTTPException:
|
|
2163
|
+
raise
|
|
2164
|
+
except Exception as e:
|
|
2165
|
+
logger.error("worker_queue_config_fetch_failed", error=str(e), queue_id=queue_id)
|
|
2166
|
+
raise HTTPException(
|
|
2167
|
+
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
|
2168
|
+
detail=f"Failed to fetch worker queue config: {str(e)}"
|
|
2169
|
+
)
|
|
2170
|
+
|
|
2171
|
+
|
|
2172
|
+
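# --- Editor's illustrative sketch (not part of the package source) -----------
# A minimal client-side loop a CLI worker might run against the /config
# endpoint above. `client` is assumed to be an httpx.AsyncClient already
# authenticated against the control plane; the reload step is hypothetical:
async def _example_watch_config(client, queue_id: str, interval: float = 30.0) -> None:
    import asyncio

    last_version = None
    while True:
        resp = await client.get(f"/worker-queues/{queue_id}/config")
        resp.raise_for_status()
        cfg = resp.json()
        if last_version is not None and cfg["config_version"] != last_version:
            pass  # hypothetical: apply the new configuration / restart the worker
        last_version = cfg["config_version"]
        await asyncio.sleep(interval)
# ------------------------------------------------------------------------------
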
@router.post("/worker-queues/{queue_id}/workers/{worker_id}/update-lock", response_model=UpdateLockResponse)
@instrument_endpoint("worker_queues.acquire_update_lock")
async def acquire_update_lock(
    queue_id: str,
    worker_id: str,
    lock_request: UpdateLockRequest,
    request: Request,
    organization: dict = Depends(get_current_organization),
    db: Session = Depends(get_db),
):
    """
    Acquire an update lock for coordinated rolling updates.

    This ensures only one worker in a queue updates at a time.
    Uses Redis for distributed locking with automatic TTL expiration.

    Args:
        queue_id: Worker queue ID
        worker_id: Worker ID requesting the lock
        lock_request: Lock configuration (duration)

    Returns:
        Lock information if acquired, or an error if another worker holds the lock
    """
    try:
        org_id = organization["id"]
        redis_client = get_redis_client()

        if not redis_client:
            raise HTTPException(
                status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
                detail="Update coordination unavailable (Redis not configured)",
            )

        # Verify the queue exists and the worker belongs to this queue
        queue = (
            db.query(WorkerQueue)
            .filter(WorkerQueue.id == queue_id, WorkerQueue.organization_id == org_id)
            .first()
        )

        if not queue:
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail="Worker queue not found",
            )

        # Check if the worker exists (optional - for validation)
        worker_heartbeat_key = f"worker:{worker_id}:heartbeat"
        worker_data = await redis_client.get(worker_heartbeat_key)

        if not worker_data:
            logger.warning(
                "worker_not_found_in_heartbeats",
                worker_id=worker_id,
                queue_id=queue_id,
                org_id=org_id,
            )

        # Try to acquire the lock using Redis SET NX (set if not exists)
        lock_key = f"worker_queue:{queue_id}:update_lock"
        lock_id = str(uuid.uuid4())
        now = datetime.now(timezone.utc)
        expires_at = now + timedelta(seconds=lock_request.lock_duration_seconds)

        lock_data = {
            "lock_id": lock_id,
            "worker_id": worker_id,
            "queue_id": queue_id,
            "organization_id": org_id,
            "acquired_at": now.isoformat(),
            "expires_at": expires_at.isoformat(),
        }

        # SET NX EX: set if not exists, with expiration
        acquired = await redis_client.set(
            lock_key,
            json.dumps(lock_data),
            ex=lock_request.lock_duration_seconds,
            nx=True,  # Only set if the key doesn't exist
        )

        if not acquired:
            # Lock already held by another worker
            existing_lock_data = await redis_client.get(lock_key)
            if existing_lock_data:
                existing_lock = json.loads(existing_lock_data)
                logger.info(
                    "update_lock_already_held",
                    queue_id=queue_id,
                    requesting_worker=worker_id,
                    lock_holder=existing_lock.get("worker_id"),
                    org_id=org_id,
                )
                raise HTTPException(
                    status_code=status.HTTP_409_CONFLICT,
                    detail=f"Update lock already held by worker {existing_lock.get('worker_id')}",
                )
            else:
                # Race condition: the lock expired or was released between the
                # failed SET and this GET
                logger.warning("update_lock_race_condition", queue_id=queue_id, worker_id=worker_id)
                raise HTTPException(
                    status_code=status.HTTP_409_CONFLICT,
                    detail="Failed to acquire lock due to race condition, please retry",
                )

        logger.info(
            "update_lock_acquired",
            lock_id=lock_id,
            worker_id=worker_id,
            queue_id=queue_id,
            duration_seconds=lock_request.lock_duration_seconds,
            org_id=org_id,
        )

        return UpdateLockResponse(
            lock_id=lock_id,
            worker_id=worker_id,
            queue_id=queue_id,
            acquired_at=now.isoformat(),
            expires_at=expires_at.isoformat(),
            locked=True,
        )

    except HTTPException:
        raise
    except Exception as e:
        logger.error(
            "update_lock_acquisition_failed",
            error=str(e),
            queue_id=queue_id,
            worker_id=worker_id,
        )
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to acquire update lock: {str(e)}",
        )


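# --- Editor's illustrative sketch (not part of the package source) -----------
# How a worker might use the lock endpoints for a rolling update: retry on 409
# (another worker is updating) and always release in a finally block so the
# queue is not blocked for the full TTL. `client` is an assumed authenticated
# httpx.AsyncClient; the self-update step is hypothetical:
async def _example_coordinated_update(client, queue_id: str, worker_id: str) -> None:
    import asyncio

    lock_url = f"/worker-queues/{queue_id}/workers/{worker_id}/update-lock"
    while True:
        resp = await client.post(lock_url, json={"worker_id": worker_id, "lock_duration_seconds": 300})
        if resp.status_code == 409:  # another worker holds the lock; wait and retry
            await asyncio.sleep(15.0)
            continue
        resp.raise_for_status()
        break
    try:
        pass  # hypothetical: run the self-update within the 300s TTL
    finally:
        await client.delete(lock_url)  # release so the next worker can proceed
# ------------------------------------------------------------------------------
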
@router.delete("/worker-queues/{queue_id}/workers/{worker_id}/update-lock", status_code=status.HTTP_204_NO_CONTENT)
@instrument_endpoint("worker_queues.release_update_lock")
async def release_update_lock(
    queue_id: str,
    worker_id: str,
    request: Request,
    organization: dict = Depends(get_current_organization),
):
    """
    Release an update lock after the worker has completed its update.

    Only the worker that acquired the lock can release it (verified by worker_id).

    Args:
        queue_id: Worker queue ID
        worker_id: Worker ID that holds the lock
    """
    try:
        org_id = organization["id"]
        redis_client = get_redis_client()

        if not redis_client:
            # If Redis is unavailable, just return success (the lock will expire naturally)
            logger.warning(
                "redis_unavailable_for_lock_release",
                queue_id=queue_id,
                worker_id=worker_id,
                org_id=org_id,
            )
            return None

        lock_key = f"worker_queue:{queue_id}:update_lock"

        # Get the current lock to verify ownership
        lock_data_str = await redis_client.get(lock_key)

        if not lock_data_str:
            # Lock doesn't exist (already expired or never acquired)
            logger.info(
                "update_lock_not_found",
                queue_id=queue_id,
                worker_id=worker_id,
                org_id=org_id,
            )
            return None

        lock_data = json.loads(lock_data_str)

        # Verify the lock is held by this worker
        if lock_data.get("worker_id") != worker_id:
            logger.warning(
                "update_lock_ownership_mismatch",
                queue_id=queue_id,
                requesting_worker=worker_id,
                lock_holder=lock_data.get("worker_id"),
                org_id=org_id,
            )
            raise HTTPException(
                status_code=status.HTTP_403_FORBIDDEN,
                detail=f"Lock is held by another worker ({lock_data.get('worker_id')})",
            )

        # Release the lock. Note: this GET-then-DELETE is not atomic; if the lock
        # expires and is re-acquired between the two calls, the DELETE drops the
        # new holder's lock (see the atomic Lua sketch below).
        await redis_client.delete(lock_key)

        logger.info(
            "update_lock_released",
            lock_id=lock_data.get("lock_id"),
            worker_id=worker_id,
            queue_id=queue_id,
            org_id=org_id,
        )

        return None

    except HTTPException:
        raise
    except Exception as e:
        logger.error(
            "update_lock_release_failed",
            error=str(e),
            queue_id=queue_id,
            worker_id=worker_id,
        )
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to release update lock: {str(e)}",
        )


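# --- Editor's illustrative sketch (not part of the package source) -----------
# The standard remedy for the non-atomic GET + DELETE noted above is a Lua
# script that checks ownership and deletes in one server-side step. A minimal
# sketch, assuming the same async redis client:
_RELEASE_LOCK_LUA = """
local data = redis.call('GET', KEYS[1])
if data and cjson.decode(data)['worker_id'] == ARGV[1] then
    return redis.call('DEL', KEYS[1])
end
return 0
"""

async def _example_release_lock_atomically(redis_client, lock_key: str, worker_id: str) -> bool:
    """Editor's sketch: delete the lock only if worker_id still owns it."""
    deleted = await redis_client.eval(_RELEASE_LOCK_LUA, 1, lock_key, worker_id)
    return bool(deleted)
# ------------------------------------------------------------------------------
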
@router.get("/worker-queues/{queue_id}/update-lock-status")
|
|
2402
|
+
@instrument_endpoint("worker_queues.get_update_lock_status")
|
|
2403
|
+
async def get_update_lock_status(
|
|
2404
|
+
queue_id: str,
|
|
2405
|
+
request: Request,
|
|
2406
|
+
organization: dict = Depends(get_current_organization),
|
|
2407
|
+
):
|
|
2408
|
+
"""
|
|
2409
|
+
Get the current update lock status for a queue.
|
|
2410
|
+
|
|
2411
|
+
Useful for checking if updates are in progress before triggering manual updates.
|
|
2412
|
+
|
|
2413
|
+
Args:
|
|
2414
|
+
queue_id: Worker queue ID
|
|
2415
|
+
|
|
2416
|
+
Returns:
|
|
2417
|
+
Lock status (locked/unlocked) and lock holder if locked
|
|
2418
|
+
"""
|
|
2419
|
+
try:
|
|
2420
|
+
org_id = organization["id"]
|
|
2421
|
+
redis_client = get_redis_client()
|
|
2422
|
+
|
|
2423
|
+
if not redis_client:
|
|
2424
|
+
return {
|
|
2425
|
+
"locked": False,
|
|
2426
|
+
"lock_coordination_available": False,
|
|
2427
|
+
"message": "Lock coordination unavailable (Redis not configured)",
|
|
2428
|
+
}
|
|
2429
|
+
|
|
2430
|
+
lock_key = f"worker_queue:{queue_id}:update_lock"
|
|
2431
|
+
lock_data_str = await redis_client.get(lock_key)
|
|
2432
|
+
|
|
2433
|
+
if not lock_data_str:
|
|
2434
|
+
return {
|
|
2435
|
+
"locked": False,
|
|
2436
|
+
"queue_id": queue_id,
|
|
2437
|
+
"lock_coordination_available": True,
|
|
2438
|
+
}
|
|
2439
|
+
|
|
2440
|
+
lock_data = json.loads(lock_data_str)
|
|
2441
|
+
|
|
2442
|
+
# Get TTL for expiration info
|
|
2443
|
+
ttl = await redis_client.ttl(lock_key)
|
|
2444
|
+
|
|
2445
|
+
return {
|
|
2446
|
+
"locked": True,
|
|
2447
|
+
"queue_id": queue_id,
|
|
2448
|
+
"worker_id": lock_data.get("worker_id"),
|
|
2449
|
+
"lock_id": lock_data.get("lock_id"),
|
|
2450
|
+
"acquired_at": lock_data.get("acquired_at"),
|
|
2451
|
+
"expires_at": lock_data.get("expires_at"),
|
|
2452
|
+
"ttl_seconds": ttl if ttl > 0 else 0,
|
|
2453
|
+
"lock_coordination_available": True,
|
|
2454
|
+
}
|
|
2455
|
+
|
|
2456
|
+
except Exception as e:
|
|
2457
|
+
logger.error(
|
|
2458
|
+
"update_lock_status_check_failed",
|
|
2459
|
+
error=str(e),
|
|
2460
|
+
queue_id=queue_id,
|
|
2461
|
+
)
|
|
2462
|
+
raise HTTPException(
|
|
2463
|
+
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
|
2464
|
+
detail=f"Failed to check lock status: {str(e)}"
|
|
2465
|
+
)
|
|
2466
|
+
|
|
2467
|
+
|
|
2468
|
+
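# --- Editor's illustrative sketch (not part of the package source) -----------
# Using the status endpoint to hold off a manual update until no worker holds
# the lock; `client` is an assumed authenticated httpx.AsyncClient:
async def _example_wait_until_unlocked(client, queue_id: str, poll: float = 5.0) -> None:
    import asyncio

    while True:
        resp = await client.get(f"/worker-queues/{queue_id}/update-lock-status")
        resp.raise_for_status()
        if not resp.json()["locked"]:
            return
        await asyncio.sleep(poll)
# ------------------------------------------------------------------------------
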
@router.get("/worker-queues/{queue_id}/executions")
|
|
2469
|
+
@instrument_endpoint("worker_queues.list_queue_executions")
|
|
2470
|
+
async def list_queue_executions(
|
|
2471
|
+
queue_id: str,
|
|
2472
|
+
request: Request,
|
|
2473
|
+
limit: int = 10,
|
|
2474
|
+
status: str = "all",
|
|
2475
|
+
organization: dict = Depends(get_current_organization),
|
|
2476
|
+
db: Session = Depends(get_db),
|
|
2477
|
+
):
|
|
2478
|
+
"""
|
|
2479
|
+
List recent executions for a specific worker queue.
|
|
2480
|
+
|
|
2481
|
+
Used by workers in single-execution mode to monitor when their task completes.
|
|
2482
|
+
|
|
2483
|
+
Args:
|
|
2484
|
+
queue_id: Worker queue ID
|
|
2485
|
+
limit: Maximum number of executions to return (default: 10)
|
|
2486
|
+
status: Filter by status ('all', 'running', 'completed', 'failed', etc.)
|
|
2487
|
+
|
|
2488
|
+
Returns:
|
|
2489
|
+
List of executions for this queue
|
|
2490
|
+
"""
|
|
2491
|
+
try:
|
|
2492
|
+
org_id = organization["id"]
|
|
2493
|
+
|
|
2494
|
+
# Verify queue exists and belongs to this org
|
|
2495
|
+
queue = (
|
|
2496
|
+
db.query(WorkerQueue)
|
|
2497
|
+
.filter(WorkerQueue.id == queue_id, WorkerQueue.organization_id == org_id)
|
|
2498
|
+
.first()
|
|
2499
|
+
)
|
|
2500
|
+
|
|
2501
|
+
if not queue:
|
|
2502
|
+
raise HTTPException(
|
|
2503
|
+
status_code=status.HTTP_404_NOT_FOUND,
|
|
2504
|
+
detail="Worker queue not found"
|
|
2505
|
+
)
|
|
2506
|
+
|
|
2507
|
+
# Import Execution model
|
|
2508
|
+
from control_plane_api.app.models.execution import Execution
|
|
2509
|
+
|
|
2510
|
+
# Query executions for this queue
|
|
2511
|
+
query = db.query(Execution).filter(
|
|
2512
|
+
Execution.organization_id == org_id,
|
|
2513
|
+
Execution.worker_queue_id == queue_id
|
|
2514
|
+
)
|
|
2515
|
+
|
|
2516
|
+
# Filter by status if not 'all'
|
|
2517
|
+
if status != "all":
|
|
2518
|
+
query = query.filter(Execution.status == status)
|
|
2519
|
+
|
|
2520
|
+
# Order by created_at descending and limit
|
|
2521
|
+
executions = query.order_by(desc(Execution.created_at)).limit(limit).all()
|
|
2522
|
+
|
|
2523
|
+
# Convert to dict for JSON response
|
|
2524
|
+
result = []
|
|
2525
|
+
for execution in executions:
|
|
2526
|
+
result.append({
|
|
2527
|
+
"id": str(execution.id),
|
|
2528
|
+
"status": execution.status,
|
|
2529
|
+
"entity_id": str(execution.entity_id),
|
|
2530
|
+
"entity_name": execution.entity_name,
|
|
2531
|
+
"execution_type": execution.execution_type,
|
|
2532
|
+
"prompt": execution.prompt[:200] if execution.prompt else None, # Truncate for brevity
|
|
2533
|
+
"created_at": execution.created_at.isoformat() if execution.created_at else None,
|
|
2534
|
+
"started_at": execution.started_at.isoformat() if execution.started_at else None,
|
|
2535
|
+
"completed_at": execution.completed_at.isoformat() if execution.completed_at else None,
|
|
2536
|
+
"temporal_workflow_id": execution.temporal_workflow_id,
|
|
2537
|
+
})
|
|
2538
|
+
|
|
2539
|
+
logger.info(
|
|
2540
|
+
"queue_executions_listed",
|
|
2541
|
+
queue_id=queue_id,
|
|
2542
|
+
count=len(result),
|
|
2543
|
+
org_id=org_id,
|
|
2544
|
+
)
|
|
2545
|
+
|
|
2546
|
+
return result
|
|
2547
|
+
|
|
2548
|
+
except HTTPException:
|
|
2549
|
+
raise
|
|
2550
|
+
except Exception as e:
|
|
2551
|
+
logger.error("queue_executions_list_failed", error=str(e), queue_id=queue_id)
|
|
2552
|
+
raise HTTPException(
|
|
2553
|
+
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
|
2554
|
+
detail=f"Failed to list queue executions: {str(e)}"
|
|
2555
|
+
)
|
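

# --- Editor's illustrative sketch (not part of the package source) -----------
# How a single-execution-mode worker might watch this endpoint until its task
# finishes. `client` is an assumed authenticated httpx.AsyncClient, and the
# set of terminal states is an assumption for illustration:
async def _example_wait_for_completion(client, queue_id: str, execution_id: str, poll: float = 10.0) -> dict:
    import asyncio

    terminal = {"completed", "failed", "cancelled"}
    while True:
        resp = await client.get(f"/worker-queues/{queue_id}/executions", params={"limit": 10})
        resp.raise_for_status()
        for execution in resp.json():
            if execution["id"] == execution_id and execution["status"] in terminal:
                return execution
        await asyncio.sleep(poll)
# ------------------------------------------------------------------------------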