kubiya-control-plane-api 0.9.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- control_plane_api/LICENSE +676 -0
- control_plane_api/README.md +350 -0
- control_plane_api/__init__.py +4 -0
- control_plane_api/__version__.py +8 -0
- control_plane_api/alembic/README +1 -0
- control_plane_api/alembic/env.py +121 -0
- control_plane_api/alembic/script.py.mako +28 -0
- control_plane_api/alembic/versions/2613c65c3dbe_initial_database_setup.py +32 -0
- control_plane_api/alembic/versions/2df520d4927d_merge_heads.py +28 -0
- control_plane_api/alembic/versions/43abf98d6a01_add_paused_status_to_executions.py +73 -0
- control_plane_api/alembic/versions/6289854264cb_merge_multiple_heads.py +28 -0
- control_plane_api/alembic/versions/6a4d4dc3d8dc_generate_execution_transitions.py +50 -0
- control_plane_api/alembic/versions/87d11cf0a783_add_disconnected_status_to_worker_.py +44 -0
- control_plane_api/alembic/versions/add_ephemeral_queue_support.py +85 -0
- control_plane_api/alembic/versions/add_model_type_to_llm_models.py +31 -0
- control_plane_api/alembic/versions/add_plan_executions_table.py +114 -0
- control_plane_api/alembic/versions/add_trace_span_tables.py +154 -0
- control_plane_api/alembic/versions/add_user_info_to_traces.py +36 -0
- control_plane_api/alembic/versions/adjusting_foreign_keys.py +32 -0
- control_plane_api/alembic/versions/b4983d976db2_initial_tables.py +1128 -0
- control_plane_api/alembic/versions/d181a3b40e71_rename_custom_metadata_to_metadata_in_.py +50 -0
- control_plane_api/alembic/versions/df9117888e82_add_missing_columns.py +82 -0
- control_plane_api/alembic/versions/f25de6ad895a_missing_migrations.py +34 -0
- control_plane_api/alembic/versions/f71305fb69b9_fix_ephemeral_queue_deletion_foreign_key.py +54 -0
- control_plane_api/alembic/versions/mark_local_exec_queues_as_ephemeral.py +68 -0
- control_plane_api/alembic.ini +148 -0
- control_plane_api/api/index.py +12 -0
- control_plane_api/app/__init__.py +11 -0
- control_plane_api/app/activities/__init__.py +20 -0
- control_plane_api/app/activities/agent_activities.py +384 -0
- control_plane_api/app/activities/plan_generation_activities.py +499 -0
- control_plane_api/app/activities/team_activities.py +424 -0
- control_plane_api/app/activities/temporal_cloud_activities.py +588 -0
- control_plane_api/app/config/__init__.py +35 -0
- control_plane_api/app/config/api_config.py +469 -0
- control_plane_api/app/config/config_loader.py +224 -0
- control_plane_api/app/config/model_pricing.py +323 -0
- control_plane_api/app/config/storage_config.py +159 -0
- control_plane_api/app/config.py +115 -0
- control_plane_api/app/controllers/__init__.py +0 -0
- control_plane_api/app/controllers/execution_environment_controller.py +1315 -0
- control_plane_api/app/database.py +135 -0
- control_plane_api/app/exceptions.py +408 -0
- control_plane_api/app/lib/__init__.py +11 -0
- control_plane_api/app/lib/environment.py +65 -0
- control_plane_api/app/lib/event_bus/__init__.py +17 -0
- control_plane_api/app/lib/event_bus/base.py +136 -0
- control_plane_api/app/lib/event_bus/manager.py +335 -0
- control_plane_api/app/lib/event_bus/providers/__init__.py +6 -0
- control_plane_api/app/lib/event_bus/providers/http_provider.py +166 -0
- control_plane_api/app/lib/event_bus/providers/nats_provider.py +324 -0
- control_plane_api/app/lib/event_bus/providers/redis_provider.py +233 -0
- control_plane_api/app/lib/event_bus/providers/websocket_provider.py +497 -0
- control_plane_api/app/lib/job_executor.py +330 -0
- control_plane_api/app/lib/kubiya_client.py +293 -0
- control_plane_api/app/lib/litellm_pricing.py +166 -0
- control_plane_api/app/lib/mcp_validation.py +163 -0
- control_plane_api/app/lib/nats/__init__.py +13 -0
- control_plane_api/app/lib/nats/credentials_manager.py +288 -0
- control_plane_api/app/lib/nats/listener.py +374 -0
- control_plane_api/app/lib/planning_prompt_builder.py +153 -0
- control_plane_api/app/lib/planning_tools/__init__.py +41 -0
- control_plane_api/app/lib/planning_tools/agents.py +409 -0
- control_plane_api/app/lib/planning_tools/agno_toolkit.py +836 -0
- control_plane_api/app/lib/planning_tools/base.py +119 -0
- control_plane_api/app/lib/planning_tools/cognitive_memory_tools.py +403 -0
- control_plane_api/app/lib/planning_tools/context_graph_tools.py +545 -0
- control_plane_api/app/lib/planning_tools/environments.py +218 -0
- control_plane_api/app/lib/planning_tools/knowledge.py +204 -0
- control_plane_api/app/lib/planning_tools/models.py +93 -0
- control_plane_api/app/lib/planning_tools/planning_service.py +646 -0
- control_plane_api/app/lib/planning_tools/resources.py +242 -0
- control_plane_api/app/lib/planning_tools/teams.py +334 -0
- control_plane_api/app/lib/policy_enforcer_client.py +1016 -0
- control_plane_api/app/lib/redis_client.py +803 -0
- control_plane_api/app/lib/sqlalchemy_utils.py +486 -0
- control_plane_api/app/lib/state_transition_tools/__init__.py +7 -0
- control_plane_api/app/lib/state_transition_tools/execution_context.py +388 -0
- control_plane_api/app/lib/storage/__init__.py +20 -0
- control_plane_api/app/lib/storage/base_provider.py +274 -0
- control_plane_api/app/lib/storage/provider_factory.py +157 -0
- control_plane_api/app/lib/storage/vercel_blob_provider.py +468 -0
- control_plane_api/app/lib/supabase.py +71 -0
- control_plane_api/app/lib/supabase_utils.py +138 -0
- control_plane_api/app/lib/task_planning/__init__.py +138 -0
- control_plane_api/app/lib/task_planning/agent_factory.py +308 -0
- control_plane_api/app/lib/task_planning/agents.py +389 -0
- control_plane_api/app/lib/task_planning/cache.py +218 -0
- control_plane_api/app/lib/task_planning/entity_resolver.py +273 -0
- control_plane_api/app/lib/task_planning/helpers.py +293 -0
- control_plane_api/app/lib/task_planning/hooks.py +474 -0
- control_plane_api/app/lib/task_planning/models.py +503 -0
- control_plane_api/app/lib/task_planning/plan_validator.py +166 -0
- control_plane_api/app/lib/task_planning/planning_workflow.py +2911 -0
- control_plane_api/app/lib/task_planning/runner.py +656 -0
- control_plane_api/app/lib/task_planning/streaming_hook.py +213 -0
- control_plane_api/app/lib/task_planning/workflow.py +424 -0
- control_plane_api/app/lib/templating/__init__.py +88 -0
- control_plane_api/app/lib/templating/compiler.py +278 -0
- control_plane_api/app/lib/templating/engine.py +178 -0
- control_plane_api/app/lib/templating/parsers/__init__.py +29 -0
- control_plane_api/app/lib/templating/parsers/base.py +96 -0
- control_plane_api/app/lib/templating/parsers/env.py +85 -0
- control_plane_api/app/lib/templating/parsers/graph.py +112 -0
- control_plane_api/app/lib/templating/parsers/secret.py +87 -0
- control_plane_api/app/lib/templating/parsers/simple.py +81 -0
- control_plane_api/app/lib/templating/resolver.py +366 -0
- control_plane_api/app/lib/templating/types.py +214 -0
- control_plane_api/app/lib/templating/validator.py +201 -0
- control_plane_api/app/lib/temporal_client.py +232 -0
- control_plane_api/app/lib/temporal_credentials_cache.py +178 -0
- control_plane_api/app/lib/temporal_credentials_service.py +203 -0
- control_plane_api/app/lib/validation/__init__.py +24 -0
- control_plane_api/app/lib/validation/runtime_validation.py +388 -0
- control_plane_api/app/main.py +531 -0
- control_plane_api/app/middleware/__init__.py +10 -0
- control_plane_api/app/middleware/auth.py +645 -0
- control_plane_api/app/middleware/exception_handler.py +267 -0
- control_plane_api/app/middleware/prometheus_middleware.py +173 -0
- control_plane_api/app/middleware/rate_limiting.py +384 -0
- control_plane_api/app/middleware/request_id.py +202 -0
- control_plane_api/app/models/__init__.py +40 -0
- control_plane_api/app/models/agent.py +90 -0
- control_plane_api/app/models/analytics.py +206 -0
- control_plane_api/app/models/associations.py +107 -0
- control_plane_api/app/models/auth_user.py +73 -0
- control_plane_api/app/models/context.py +161 -0
- control_plane_api/app/models/custom_integration.py +99 -0
- control_plane_api/app/models/environment.py +64 -0
- control_plane_api/app/models/execution.py +125 -0
- control_plane_api/app/models/execution_transition.py +50 -0
- control_plane_api/app/models/job.py +159 -0
- control_plane_api/app/models/llm_model.py +78 -0
- control_plane_api/app/models/orchestration.py +66 -0
- control_plane_api/app/models/plan_execution.py +102 -0
- control_plane_api/app/models/presence.py +49 -0
- control_plane_api/app/models/project.py +61 -0
- control_plane_api/app/models/project_management.py +85 -0
- control_plane_api/app/models/session.py +29 -0
- control_plane_api/app/models/skill.py +155 -0
- control_plane_api/app/models/system_tables.py +43 -0
- control_plane_api/app/models/task_planning.py +372 -0
- control_plane_api/app/models/team.py +86 -0
- control_plane_api/app/models/trace.py +257 -0
- control_plane_api/app/models/user_profile.py +54 -0
- control_plane_api/app/models/worker.py +221 -0
- control_plane_api/app/models/workflow.py +161 -0
- control_plane_api/app/models/workspace.py +50 -0
- control_plane_api/app/observability/__init__.py +177 -0
- control_plane_api/app/observability/context_logging.py +475 -0
- control_plane_api/app/observability/decorators.py +337 -0
- control_plane_api/app/observability/local_span_processor.py +702 -0
- control_plane_api/app/observability/metrics.py +303 -0
- control_plane_api/app/observability/middleware.py +246 -0
- control_plane_api/app/observability/optional.py +115 -0
- control_plane_api/app/observability/tracing.py +382 -0
- control_plane_api/app/policies/README.md +149 -0
- control_plane_api/app/policies/approved_users.rego +62 -0
- control_plane_api/app/policies/business_hours.rego +51 -0
- control_plane_api/app/policies/rate_limiting.rego +100 -0
- control_plane_api/app/policies/tool_enforcement/README.md +336 -0
- control_plane_api/app/policies/tool_enforcement/bash_command_validation.rego +71 -0
- control_plane_api/app/policies/tool_enforcement/business_hours_enforcement.rego +82 -0
- control_plane_api/app/policies/tool_enforcement/mcp_tool_allowlist.rego +58 -0
- control_plane_api/app/policies/tool_enforcement/production_safeguards.rego +80 -0
- control_plane_api/app/policies/tool_enforcement/role_based_tool_access.rego +44 -0
- control_plane_api/app/policies/tool_restrictions.rego +86 -0
- control_plane_api/app/routers/__init__.py +4 -0
- control_plane_api/app/routers/agents.py +382 -0
- control_plane_api/app/routers/agents_v2.py +1598 -0
- control_plane_api/app/routers/analytics.py +1310 -0
- control_plane_api/app/routers/auth.py +59 -0
- control_plane_api/app/routers/client_config.py +57 -0
- control_plane_api/app/routers/context_graph.py +561 -0
- control_plane_api/app/routers/context_manager.py +577 -0
- control_plane_api/app/routers/custom_integrations.py +490 -0
- control_plane_api/app/routers/enforcer.py +132 -0
- control_plane_api/app/routers/environment_context.py +252 -0
- control_plane_api/app/routers/environments.py +761 -0
- control_plane_api/app/routers/execution_environment.py +847 -0
- control_plane_api/app/routers/executions/__init__.py +28 -0
- control_plane_api/app/routers/executions/router.py +286 -0
- control_plane_api/app/routers/executions/services/__init__.py +22 -0
- control_plane_api/app/routers/executions/services/demo_worker_health.py +156 -0
- control_plane_api/app/routers/executions/services/status_service.py +420 -0
- control_plane_api/app/routers/executions/services/test_worker_health.py +480 -0
- control_plane_api/app/routers/executions/services/worker_health.py +514 -0
- control_plane_api/app/routers/executions/streaming/__init__.py +22 -0
- control_plane_api/app/routers/executions/streaming/deduplication.py +352 -0
- control_plane_api/app/routers/executions/streaming/event_buffer.py +353 -0
- control_plane_api/app/routers/executions/streaming/event_formatter.py +964 -0
- control_plane_api/app/routers/executions/streaming/history_loader.py +588 -0
- control_plane_api/app/routers/executions/streaming/live_source.py +693 -0
- control_plane_api/app/routers/executions/streaming/streamer.py +849 -0
- control_plane_api/app/routers/executions.py +4888 -0
- control_plane_api/app/routers/health.py +165 -0
- control_plane_api/app/routers/health_v2.py +394 -0
- control_plane_api/app/routers/integration_templates.py +496 -0
- control_plane_api/app/routers/integrations.py +287 -0
- control_plane_api/app/routers/jobs.py +1809 -0
- control_plane_api/app/routers/metrics.py +517 -0
- control_plane_api/app/routers/models.py +82 -0
- control_plane_api/app/routers/models_v2.py +628 -0
- control_plane_api/app/routers/plan_executions.py +1481 -0
- control_plane_api/app/routers/plan_generation_async.py +304 -0
- control_plane_api/app/routers/policies.py +669 -0
- control_plane_api/app/routers/presence.py +234 -0
- control_plane_api/app/routers/projects.py +987 -0
- control_plane_api/app/routers/runners.py +379 -0
- control_plane_api/app/routers/runtimes.py +172 -0
- control_plane_api/app/routers/secrets.py +171 -0
- control_plane_api/app/routers/skills.py +1010 -0
- control_plane_api/app/routers/skills_definitions.py +140 -0
- control_plane_api/app/routers/storage.py +456 -0
- control_plane_api/app/routers/task_planning.py +611 -0
- control_plane_api/app/routers/task_queues.py +650 -0
- control_plane_api/app/routers/team_context.py +274 -0
- control_plane_api/app/routers/teams.py +1747 -0
- control_plane_api/app/routers/templates.py +248 -0
- control_plane_api/app/routers/traces.py +571 -0
- control_plane_api/app/routers/websocket_client.py +479 -0
- control_plane_api/app/routers/websocket_executions_status.py +437 -0
- control_plane_api/app/routers/websocket_gateway.py +323 -0
- control_plane_api/app/routers/websocket_traces.py +576 -0
- control_plane_api/app/routers/worker_queues.py +2555 -0
- control_plane_api/app/routers/worker_websocket.py +419 -0
- control_plane_api/app/routers/workers.py +1004 -0
- control_plane_api/app/routers/workflows.py +204 -0
- control_plane_api/app/runtimes/__init__.py +6 -0
- control_plane_api/app/runtimes/validation.py +344 -0
- control_plane_api/app/schemas/__init__.py +1 -0
- control_plane_api/app/schemas/job_schemas.py +302 -0
- control_plane_api/app/schemas/mcp_schemas.py +311 -0
- control_plane_api/app/schemas/template_schemas.py +133 -0
- control_plane_api/app/schemas/trace_schemas.py +168 -0
- control_plane_api/app/schemas/worker_queue_observability_schemas.py +165 -0
- control_plane_api/app/services/__init__.py +1 -0
- control_plane_api/app/services/agno_planning_strategy.py +233 -0
- control_plane_api/app/services/agno_service.py +838 -0
- control_plane_api/app/services/claude_code_planning_service.py +203 -0
- control_plane_api/app/services/context_graph_client.py +224 -0
- control_plane_api/app/services/custom_integration_service.py +415 -0
- control_plane_api/app/services/integration_resolution_service.py +345 -0
- control_plane_api/app/services/litellm_service.py +394 -0
- control_plane_api/app/services/plan_generator.py +79 -0
- control_plane_api/app/services/planning_strategy.py +66 -0
- control_plane_api/app/services/planning_strategy_factory.py +118 -0
- control_plane_api/app/services/policy_service.py +615 -0
- control_plane_api/app/services/state_transition_service.py +755 -0
- control_plane_api/app/services/storage_service.py +593 -0
- control_plane_api/app/services/temporal_cloud_provisioning.py +150 -0
- control_plane_api/app/services/toolsets/context_graph_skill.py +432 -0
- control_plane_api/app/services/trace_retention.py +354 -0
- control_plane_api/app/services/worker_queue_metrics_service.py +190 -0
- control_plane_api/app/services/workflow_cancellation_manager.py +135 -0
- control_plane_api/app/services/workflow_operations_service.py +611 -0
- control_plane_api/app/skills/__init__.py +100 -0
- control_plane_api/app/skills/base.py +239 -0
- control_plane_api/app/skills/builtin/__init__.py +37 -0
- control_plane_api/app/skills/builtin/agent_communication/__init__.py +8 -0
- control_plane_api/app/skills/builtin/agent_communication/skill.py +246 -0
- control_plane_api/app/skills/builtin/code_ingestion/__init__.py +4 -0
- control_plane_api/app/skills/builtin/code_ingestion/skill.py +267 -0
- control_plane_api/app/skills/builtin/cognitive_memory/__init__.py +4 -0
- control_plane_api/app/skills/builtin/cognitive_memory/skill.py +174 -0
- control_plane_api/app/skills/builtin/contextual_awareness/__init__.py +4 -0
- control_plane_api/app/skills/builtin/contextual_awareness/skill.py +387 -0
- control_plane_api/app/skills/builtin/data_visualization/__init__.py +4 -0
- control_plane_api/app/skills/builtin/data_visualization/skill.py +154 -0
- control_plane_api/app/skills/builtin/docker/__init__.py +4 -0
- control_plane_api/app/skills/builtin/docker/skill.py +104 -0
- control_plane_api/app/skills/builtin/file_generation/__init__.py +4 -0
- control_plane_api/app/skills/builtin/file_generation/skill.py +94 -0
- control_plane_api/app/skills/builtin/file_system/__init__.py +4 -0
- control_plane_api/app/skills/builtin/file_system/skill.py +110 -0
- control_plane_api/app/skills/builtin/knowledge_api/__init__.py +5 -0
- control_plane_api/app/skills/builtin/knowledge_api/skill.py +124 -0
- control_plane_api/app/skills/builtin/python/__init__.py +4 -0
- control_plane_api/app/skills/builtin/python/skill.py +92 -0
- control_plane_api/app/skills/builtin/remote_filesystem/__init__.py +5 -0
- control_plane_api/app/skills/builtin/remote_filesystem/skill.py +170 -0
- control_plane_api/app/skills/builtin/shell/__init__.py +4 -0
- control_plane_api/app/skills/builtin/shell/skill.py +161 -0
- control_plane_api/app/skills/builtin/slack/__init__.py +3 -0
- control_plane_api/app/skills/builtin/slack/skill.py +302 -0
- control_plane_api/app/skills/builtin/workflow_executor/__init__.py +4 -0
- control_plane_api/app/skills/builtin/workflow_executor/skill.py +469 -0
- control_plane_api/app/skills/business_intelligence.py +189 -0
- control_plane_api/app/skills/config.py +63 -0
- control_plane_api/app/skills/loaders/__init__.py +14 -0
- control_plane_api/app/skills/loaders/base.py +73 -0
- control_plane_api/app/skills/loaders/filesystem_loader.py +199 -0
- control_plane_api/app/skills/registry.py +125 -0
- control_plane_api/app/utils/helpers.py +12 -0
- control_plane_api/app/utils/workflow_executor.py +354 -0
- control_plane_api/app/workflows/__init__.py +11 -0
- control_plane_api/app/workflows/agent_execution.py +520 -0
- control_plane_api/app/workflows/agent_execution_with_skills.py +223 -0
- control_plane_api/app/workflows/namespace_provisioning.py +326 -0
- control_plane_api/app/workflows/plan_generation.py +254 -0
- control_plane_api/app/workflows/team_execution.py +442 -0
- control_plane_api/scripts/seed_models.py +240 -0
- control_plane_api/scripts/validate_existing_tool_names.py +492 -0
- control_plane_api/shared/__init__.py +8 -0
- control_plane_api/shared/version.py +17 -0
- control_plane_api/test_deduplication.py +274 -0
- control_plane_api/test_executor_deduplication_e2e.py +309 -0
- control_plane_api/test_job_execution_e2e.py +283 -0
- control_plane_api/test_real_integration.py +193 -0
- control_plane_api/version.py +38 -0
- control_plane_api/worker/__init__.py +0 -0
- control_plane_api/worker/activities/__init__.py +0 -0
- control_plane_api/worker/activities/agent_activities.py +1585 -0
- control_plane_api/worker/activities/approval_activities.py +234 -0
- control_plane_api/worker/activities/job_activities.py +199 -0
- control_plane_api/worker/activities/runtime_activities.py +1167 -0
- control_plane_api/worker/activities/skill_activities.py +282 -0
- control_plane_api/worker/activities/team_activities.py +479 -0
- control_plane_api/worker/agent_runtime_server.py +370 -0
- control_plane_api/worker/binary_manager.py +333 -0
- control_plane_api/worker/config/__init__.py +31 -0
- control_plane_api/worker/config/worker_config.py +273 -0
- control_plane_api/worker/control_plane_client.py +1491 -0
- control_plane_api/worker/examples/analytics_integration_example.py +362 -0
- control_plane_api/worker/health_monitor.py +159 -0
- control_plane_api/worker/metrics.py +237 -0
- control_plane_api/worker/models/__init__.py +1 -0
- control_plane_api/worker/models/error_events.py +105 -0
- control_plane_api/worker/models/inputs.py +89 -0
- control_plane_api/worker/runtimes/__init__.py +35 -0
- control_plane_api/worker/runtimes/agent_runtime/runtime.py +485 -0
- control_plane_api/worker/runtimes/agno/__init__.py +34 -0
- control_plane_api/worker/runtimes/agno/config.py +248 -0
- control_plane_api/worker/runtimes/agno/hooks.py +385 -0
- control_plane_api/worker/runtimes/agno/mcp_builder.py +195 -0
- control_plane_api/worker/runtimes/agno/runtime.py +1063 -0
- control_plane_api/worker/runtimes/agno/utils.py +163 -0
- control_plane_api/worker/runtimes/base.py +979 -0
- control_plane_api/worker/runtimes/claude_code/__init__.py +38 -0
- control_plane_api/worker/runtimes/claude_code/cleanup.py +184 -0
- control_plane_api/worker/runtimes/claude_code/client_pool.py +529 -0
- control_plane_api/worker/runtimes/claude_code/config.py +829 -0
- control_plane_api/worker/runtimes/claude_code/hooks.py +482 -0
- control_plane_api/worker/runtimes/claude_code/litellm_proxy.py +1702 -0
- control_plane_api/worker/runtimes/claude_code/mcp_builder.py +467 -0
- control_plane_api/worker/runtimes/claude_code/mcp_discovery.py +558 -0
- control_plane_api/worker/runtimes/claude_code/runtime.py +1546 -0
- control_plane_api/worker/runtimes/claude_code/tool_mapper.py +403 -0
- control_plane_api/worker/runtimes/claude_code/utils.py +149 -0
- control_plane_api/worker/runtimes/factory.py +173 -0
- control_plane_api/worker/runtimes/model_utils.py +107 -0
- control_plane_api/worker/runtimes/validation.py +93 -0
- control_plane_api/worker/services/__init__.py +1 -0
- control_plane_api/worker/services/agent_communication_tools.py +908 -0
- control_plane_api/worker/services/agent_executor.py +485 -0
- control_plane_api/worker/services/agent_executor_v2.py +793 -0
- control_plane_api/worker/services/analytics_collector.py +457 -0
- control_plane_api/worker/services/analytics_service.py +464 -0
- control_plane_api/worker/services/approval_tools.py +310 -0
- control_plane_api/worker/services/approval_tools_agno.py +207 -0
- control_plane_api/worker/services/cancellation_manager.py +177 -0
- control_plane_api/worker/services/code_ingestion_tools.py +465 -0
- control_plane_api/worker/services/contextual_awareness_tools.py +405 -0
- control_plane_api/worker/services/data_visualization.py +834 -0
- control_plane_api/worker/services/event_publisher.py +531 -0
- control_plane_api/worker/services/jira_tools.py +257 -0
- control_plane_api/worker/services/remote_filesystem_tools.py +498 -0
- control_plane_api/worker/services/runtime_analytics.py +328 -0
- control_plane_api/worker/services/session_service.py +365 -0
- control_plane_api/worker/services/skill_context_enhancement.py +181 -0
- control_plane_api/worker/services/skill_factory.py +471 -0
- control_plane_api/worker/services/system_prompt_enhancement.py +410 -0
- control_plane_api/worker/services/team_executor.py +715 -0
- control_plane_api/worker/services/team_executor_v2.py +1866 -0
- control_plane_api/worker/services/tool_enforcement.py +254 -0
- control_plane_api/worker/services/workflow_executor/__init__.py +52 -0
- control_plane_api/worker/services/workflow_executor/event_processor.py +287 -0
- control_plane_api/worker/services/workflow_executor/event_publisher.py +210 -0
- control_plane_api/worker/services/workflow_executor/executors/__init__.py +15 -0
- control_plane_api/worker/services/workflow_executor/executors/base.py +270 -0
- control_plane_api/worker/services/workflow_executor/executors/json_executor.py +50 -0
- control_plane_api/worker/services/workflow_executor/executors/python_executor.py +50 -0
- control_plane_api/worker/services/workflow_executor/models.py +142 -0
- control_plane_api/worker/services/workflow_executor_tools.py +1748 -0
- control_plane_api/worker/skills/__init__.py +12 -0
- control_plane_api/worker/skills/builtin/context_graph_search/README.md +213 -0
- control_plane_api/worker/skills/builtin/context_graph_search/__init__.py +5 -0
- control_plane_api/worker/skills/builtin/context_graph_search/agno_impl.py +808 -0
- control_plane_api/worker/skills/builtin/context_graph_search/skill.yaml +67 -0
- control_plane_api/worker/skills/builtin/contextual_awareness/__init__.py +4 -0
- control_plane_api/worker/skills/builtin/contextual_awareness/agno_impl.py +62 -0
- control_plane_api/worker/skills/builtin/data_visualization/agno_impl.py +18 -0
- control_plane_api/worker/skills/builtin/data_visualization/skill.yaml +84 -0
- control_plane_api/worker/skills/builtin/docker/agno_impl.py +65 -0
- control_plane_api/worker/skills/builtin/docker/skill.yaml +60 -0
- control_plane_api/worker/skills/builtin/file_generation/agno_impl.py +47 -0
- control_plane_api/worker/skills/builtin/file_generation/skill.yaml +64 -0
- control_plane_api/worker/skills/builtin/file_system/agno_impl.py +32 -0
- control_plane_api/worker/skills/builtin/file_system/skill.yaml +54 -0
- control_plane_api/worker/skills/builtin/knowledge_api/__init__.py +4 -0
- control_plane_api/worker/skills/builtin/knowledge_api/agno_impl.py +50 -0
- control_plane_api/worker/skills/builtin/knowledge_api/skill.yaml +66 -0
- control_plane_api/worker/skills/builtin/python/agno_impl.py +25 -0
- control_plane_api/worker/skills/builtin/python/skill.yaml +60 -0
- control_plane_api/worker/skills/builtin/schema_fix_mixin.py +260 -0
- control_plane_api/worker/skills/builtin/shell/agno_impl.py +31 -0
- control_plane_api/worker/skills/builtin/shell/skill.yaml +60 -0
- control_plane_api/worker/skills/builtin/slack/__init__.py +3 -0
- control_plane_api/worker/skills/builtin/slack/agno_impl.py +1282 -0
- control_plane_api/worker/skills/builtin/slack/skill.yaml +276 -0
- control_plane_api/worker/skills/builtin/workflow_executor/agno_impl.py +62 -0
- control_plane_api/worker/skills/builtin/workflow_executor/skill.yaml +79 -0
- control_plane_api/worker/skills/loaders/__init__.py +5 -0
- control_plane_api/worker/skills/loaders/base.py +23 -0
- control_plane_api/worker/skills/loaders/filesystem_loader.py +357 -0
- control_plane_api/worker/skills/registry.py +208 -0
- control_plane_api/worker/tests/__init__.py +1 -0
- control_plane_api/worker/tests/conftest.py +12 -0
- control_plane_api/worker/tests/e2e/__init__.py +0 -0
- control_plane_api/worker/tests/e2e/test_context_graph_real_api.py +338 -0
- control_plane_api/worker/tests/e2e/test_context_graph_templates_e2e.py +523 -0
- control_plane_api/worker/tests/e2e/test_enforcement_e2e.py +344 -0
- control_plane_api/worker/tests/e2e/test_execution_flow.py +571 -0
- control_plane_api/worker/tests/e2e/test_single_execution_mode.py +656 -0
- control_plane_api/worker/tests/integration/__init__.py +0 -0
- control_plane_api/worker/tests/integration/test_builtin_skills_fixes.py +245 -0
- control_plane_api/worker/tests/integration/test_context_graph_search_integration.py +365 -0
- control_plane_api/worker/tests/integration/test_control_plane_integration.py +308 -0
- control_plane_api/worker/tests/integration/test_hook_enforcement_integration.py +579 -0
- control_plane_api/worker/tests/integration/test_scheduled_job_workflow.py +237 -0
- control_plane_api/worker/tests/integration/test_system_prompt_enhancement_integration.py +343 -0
- control_plane_api/worker/tests/unit/__init__.py +0 -0
- control_plane_api/worker/tests/unit/test_builtin_skill_autoload.py +396 -0
- control_plane_api/worker/tests/unit/test_context_graph_search.py +450 -0
- control_plane_api/worker/tests/unit/test_context_graph_templates.py +403 -0
- control_plane_api/worker/tests/unit/test_control_plane_client.py +401 -0
- control_plane_api/worker/tests/unit/test_control_plane_client_jobs.py +345 -0
- control_plane_api/worker/tests/unit/test_job_activities.py +353 -0
- control_plane_api/worker/tests/unit/test_skill_context_enhancement.py +321 -0
- control_plane_api/worker/tests/unit/test_system_prompt_enhancement.py +415 -0
- control_plane_api/worker/tests/unit/test_tool_enforcement.py +324 -0
- control_plane_api/worker/utils/__init__.py +1 -0
- control_plane_api/worker/utils/chunk_batcher.py +330 -0
- control_plane_api/worker/utils/environment.py +65 -0
- control_plane_api/worker/utils/error_publisher.py +260 -0
- control_plane_api/worker/utils/event_batcher.py +256 -0
- control_plane_api/worker/utils/logging_config.py +335 -0
- control_plane_api/worker/utils/logging_helper.py +326 -0
- control_plane_api/worker/utils/parameter_validator.py +120 -0
- control_plane_api/worker/utils/retry_utils.py +60 -0
- control_plane_api/worker/utils/streaming_utils.py +665 -0
- control_plane_api/worker/utils/tool_validation.py +332 -0
- control_plane_api/worker/utils/workspace_manager.py +163 -0
- control_plane_api/worker/websocket_client.py +393 -0
- control_plane_api/worker/worker.py +1297 -0
- control_plane_api/worker/workflows/__init__.py +0 -0
- control_plane_api/worker/workflows/agent_execution.py +909 -0
- control_plane_api/worker/workflows/scheduled_job_wrapper.py +332 -0
- control_plane_api/worker/workflows/team_execution.py +611 -0
- kubiya_control_plane_api-0.9.15.dist-info/METADATA +354 -0
- kubiya_control_plane_api-0.9.15.dist-info/RECORD +479 -0
- kubiya_control_plane_api-0.9.15.dist-info/WHEEL +5 -0
- kubiya_control_plane_api-0.9.15.dist-info/entry_points.txt +5 -0
- kubiya_control_plane_api-0.9.15.dist-info/licenses/LICENSE +676 -0
- kubiya_control_plane_api-0.9.15.dist-info/top_level.txt +3 -0
- scripts/__init__.py +1 -0
- scripts/migrations.py +39 -0
- scripts/seed_worker_queues.py +128 -0
- scripts/setup_agent_runtime.py +142 -0
- worker_internal/__init__.py +1 -0
- worker_internal/planner/__init__.py +1 -0
- worker_internal/planner/activities.py +1499 -0
- worker_internal/planner/agent_tools.py +197 -0
- worker_internal/planner/event_models.py +148 -0
- worker_internal/planner/event_publisher.py +67 -0
- worker_internal/planner/models.py +199 -0
- worker_internal/planner/retry_logic.py +134 -0
- worker_internal/planner/worker.py +300 -0
- worker_internal/planner/workflows.py +970 -0
|
@@ -0,0 +1,1004 @@
|
|
|
1
|
+
"""Workers endpoint - shows registered Temporal workers and handles worker registration"""
|
|
2
|
+
|
|
3
|
+
from fastapi import APIRouter, Depends, HTTPException, status, Request
|
|
4
|
+
from typing import List, Dict, Any, Optional
|
|
5
|
+
from pydantic import BaseModel
|
|
6
|
+
from datetime import datetime, timezone
|
|
7
|
+
from sqlalchemy.orm import Session, joinedload
|
|
8
|
+
import structlog
|
|
9
|
+
import uuid
|
|
10
|
+
import json
|
|
11
|
+
|
|
12
|
+
from control_plane_api.app.middleware.auth import get_current_organization
|
|
13
|
+
from control_plane_api.app.lib.temporal_client import get_temporal_client
|
|
14
|
+
from control_plane_api.app.database import get_db
|
|
15
|
+
from control_plane_api.app.lib.redis_client import get_redis_client
|
|
16
|
+
from control_plane_api.app.models.worker import WorkerHeartbeat, WorkerQueue
|
|
17
|
+
from control_plane_api.app.models.environment import Environment
|
|
18
|
+
from control_plane_api.app.observability import (
|
|
19
|
+
instrument_endpoint,
|
|
20
|
+
create_span_with_context,
|
|
21
|
+
add_span_event,
|
|
22
|
+
add_span_error,
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
logger = structlog.get_logger()
|
|
26
|
+
|
|
27
|
+
router = APIRouter()
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class WorkerInfo(BaseModel):
    """Details of a single worker listed under a task queue.

    Instances appear in ``TaskQueueInfo.workers``. None of the fields
    declares a default value; the two nullable fields accept ``None``.
    """

    # Identity string of the worker.
    identity: str

    # Last access time carried as a string, or None.
    # NOTE(review): presumably a timestamp; the exact format is not
    # visible in this model - confirm against the code that fills it.
    last_access_time: str | None

    # Rate-per-second value for the worker, or None.
    rate_per_second: float | None
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class TaskQueueInfo(BaseModel):
    """Summary of a task queue together with the workers polling it."""

    # Full queue name, built by the endpoints as "{organization_id}.{runner_name}".
    task_queue: str
    organization_id: str
    runner_name: str
    # One entry per poller found on the queue (empty when none were found).
    workers: list[WorkerInfo]
    # Number of entries in `workers`.
    worker_count: int
    # Backlog estimate when the queue description exposes one, otherwise None.
    approximate_backlog_count: Optional[int]
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
@router.get("", response_model=List[TaskQueueInfo])
@instrument_endpoint("workers.list_workers")
async def list_workers(
    request: Request,
    organization: dict = Depends(get_current_organization),
):
    """
    List registered Temporal workers for the organization.

    The organization's runners are fetched from the Kubiya API (falling back
    to a single "default" runner when that call fails). For every runner the
    workflow task queue is described through the Temporal client and the
    pollers found on it are reported as workers.

    Task queue naming convention: {organization_id}.{runner_name}

    Args:
        request: Incoming request; ``request.state.kubiya_token`` is forwarded
            to the Kubiya API.
        organization: Authenticated organization; only ``"id"`` is read.

    Returns:
        One ``TaskQueueInfo`` per runner. A queue that cannot be described is
        still listed, with an empty worker list.

    Raises:
        HTTPException: 500 when the listing fails as a whole.
    """
    try:
        temporal_client = await get_temporal_client()
        org_id = organization["id"]

        # Get runners from Kubiya API to know which task queues to check
        from control_plane_api.app.lib.kubiya_client import get_kubiya_client
        kubiya_client = get_kubiya_client()
        token = request.state.kubiya_token

        try:
            runners = await kubiya_client.get_runners(token, org_id)
        except Exception as e:
            logger.warning(
                "failed_to_fetch_kubiya_runners",
                error=str(e),
                org_id=org_id
            )
            # If we can't get runners from Kubiya, fall back to checking common ones
            runners = [{"name": "default"}]

        environments_info = []

        for runner in runners:
            # Runner might be a dict or a string. A missing, null or empty
            # name falls back to "default" in both cases; previously a dict
            # with a null name produced runner_name=None, which failed
            # TaskQueueInfo validation and turned the whole listing into a 500.
            raw_name = runner.get("name") if isinstance(runner, dict) else runner
            runner_name = str(raw_name) if raw_name else "default"

            task_queue = f"{org_id}.{runner_name}"

            try:
                # Describe the task queue to get worker information
                desc = await temporal_client.describe_task_queue(
                    task_queue=task_queue,
                    task_queue_type=1,  # TaskQueueType.WORKFLOW
                )

                # Extract worker information from pollers
                workers = [
                    WorkerInfo(
                        identity=poller.identity,
                        last_access_time=(
                            poller.last_access_time.isoformat()
                            if poller.last_access_time
                            else None
                        ),
                        rate_per_second=getattr(poller, "rate_per_second", None),
                    )
                    for poller in (desc.pollers or [])
                ]

                task_queue_info = TaskQueueInfo(
                    task_queue=task_queue,
                    organization_id=org_id,
                    runner_name=runner_name,
                    workers=workers,
                    worker_count=len(workers),
                    # Only present on some queue descriptions
                    approximate_backlog_count=getattr(desc, "approximate_backlog_count", None),
                )

                logger.info(
                    "task_queue_described",
                    task_queue=task_queue,
                    worker_count=len(workers),
                    org_id=org_id,
                )

            except Exception as e:
                # Task queue might not exist yet if no worker has registered
                logger.debug(
                    "task_queue_not_found",
                    task_queue=task_queue,
                    error=str(e),
                    org_id=org_id,
                )
                # Report the queue anyway, without workers
                task_queue_info = TaskQueueInfo(
                    task_queue=task_queue,
                    organization_id=org_id,
                    runner_name=runner_name,
                    workers=[],
                    worker_count=0,
                    approximate_backlog_count=None,
                )

            environments_info.append(task_queue_info)

        logger.info(
            "workers_listed",
            org_id=org_id,
            task_queue_count=len(environments_info),
            total_workers=sum(tq.worker_count for tq in environments_info),
        )

        return environments_info

    except Exception as e:
        logger.error(
            "workers_list_failed",
            error=str(e),
            org_id=organization["id"]
        )
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to list workers: {str(e)}"
        )
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
@router.get("/{runner_name}", response_model=TaskQueueInfo)
@instrument_endpoint("workers.get_workers_for_runner")
async def get_workers_for_runner(
    runner_name: str,
    request: Request,
    organization: dict = Depends(get_current_organization),
):
    """
    Report the workers polling the task queue of a single runner.

    The queue name is "{organization_id}.{runner_name}". When the queue
    cannot be described, an entry with no workers is returned instead of an
    error.

    Args:
        runner_name: The runner name (e.g., "default", "production-runner")
        request: Incoming request (not read by this handler).
        organization: Authenticated organization; only ``"id"`` is read.

    Raises:
        HTTPException: 500 when the lookup fails outside the queue description.
    """
    try:
        temporal_client = await get_temporal_client()
        org_id = organization["id"]
        task_queue = f"{org_id}.{runner_name}"

        try:
            queue_description = await temporal_client.describe_task_queue(
                task_queue=task_queue,
                task_queue_type=1,  # TaskQueueType.WORKFLOW
            )

            # One WorkerInfo per poller; no pollers means no workers.
            found_workers = [
                WorkerInfo(
                    identity=entry.identity,
                    last_access_time=(
                        entry.last_access_time.isoformat()
                        if entry.last_access_time
                        else None
                    ),
                    rate_per_second=getattr(entry, 'rate_per_second', None),
                )
                for entry in (queue_description.pollers or [])
            ]

            # The backlog estimate is optional on the description object.
            backlog_estimate = getattr(queue_description, 'approximate_backlog_count', None)

            result = TaskQueueInfo(
                task_queue=task_queue,
                organization_id=org_id,
                runner_name=runner_name,
                workers=found_workers,
                worker_count=len(found_workers),
                approximate_backlog_count=backlog_estimate,
            )

            logger.info(
                "workers_fetched_for_runner",
                runner_name=runner_name,
                worker_count=len(found_workers),
                org_id=org_id,
            )

            return result

        except Exception as queue_error:
            logger.warning(
                "task_queue_not_found",
                task_queue=task_queue,
                error=str(queue_error),
                org_id=org_id,
            )
            # An undescribable queue is reported as a queue without workers.
            return TaskQueueInfo(
                task_queue=task_queue,
                organization_id=org_id,
                runner_name=runner_name,
                workers=[],
                worker_count=0,
                approximate_backlog_count=None,
            )

    except Exception as e:
        logger.error(
            "workers_fetch_failed",
            error=str(e),
            runner_name=runner_name,
            org_id=organization["id"]
        )
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to fetch workers: {str(e)}"
        )
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
# Worker Registration for Decoupled Architecture
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
class WorkerRegistrationRequest(BaseModel):
    """Body a worker sends to the register endpoint."""

    # Task queue / environment name the worker wants to join.
    environment_name: str
    hostname: str | None = None
    # Free-form metadata stored on the worker record.
    worker_metadata: dict[str, Any] = {}
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
class WorkerRegistrationResponse(BaseModel):
    """Everything a freshly registered worker needs in order to operate."""

    # Unique ID generated for this worker instance.
    worker_id: str
    # Token of the environment the worker joined.
    worker_token: str
    # Task queue name, formatted as org_id.environment.
    environment_name: str
    temporal_namespace: str
    temporal_host: str
    temporal_api_key: str
    organization_id: str
    control_plane_url: str
|
|
283
|
+
|
|
284
|
+
|
|
285
|
+
class WorkerHeartbeatRequest(BaseModel):
    """Body of a heartbeat that carries the worker ID in the payload."""

    worker_id: str
    environment_name: str
    # One of: active, idle, busy
    status: str = "active"
    tasks_processed: int = 0
    current_task_id: str | None = None
    worker_metadata: dict[str, Any] = {}
|
|
293
|
+
|
|
294
|
+
|
|
295
|
+
@router.post("/register", response_model=WorkerRegistrationResponse)
@instrument_endpoint("workers.register_worker")
async def register_worker(
    registration: WorkerRegistrationRequest,
    request: Request,
    organization: dict = Depends(get_current_organization),
    db: Session = Depends(get_db),
):
    """
    Register a new worker with the control plane.

    This endpoint is called by workers on startup to get their configuration.
    The environment named in the request is looked up for the organization
    and created (status "active", fresh worker token) when it does not exist.
    A ``WorkerHeartbeat`` row is then stored for the new worker.

    Args:
        registration: Environment name, hostname and metadata of the worker.
        request: Incoming request; ``request.state.kubiya_token`` is used to
            fetch Temporal credentials and the request URL is the fallback
            for the control plane URL.
        organization: Authenticated organization; only ``"id"`` is read.
        db: Database session.

    Returns:
        All configuration needed for worker to connect to Temporal and operate:
        - worker_id: Unique ID for this worker instance
        - worker_token: Environment's worker token
        - environment_name: Formatted task queue name (org_id.environment)
        - temporal_namespace, temporal_host, temporal_api_key: Temporal config
        - organization_id: Organization ID
        - control_plane_url: URL to send heartbeats

    Raises:
        HTTPException: 400 when the environment is not "ready"/"active",
            500 when credentials cannot be fetched or registration fails.
    """
    import os

    try:
        from control_plane_api.app.lib.temporal_credentials_service import (
            get_temporal_credentials_for_org,
            is_local_temporal
        )

        org_id = organization["id"]

        # Look up the environment by name
        environment = db.query(Environment).filter(
            Environment.organization_id == org_id,
            Environment.name == registration.environment_name
        ).first()

        # If environment doesn't exist, create it
        if not environment:
            logger.info(
                "creating_environment_for_worker",
                environment_name=registration.environment_name,
                org_id=org_id,
            )

            # One timestamp so created_at and updated_at agree exactly
            created = datetime.now(timezone.utc)
            environment = Environment(
                id=uuid.uuid4(),
                organization_id=org_id,
                name=registration.environment_name,
                # Worker token for this environment (UUID format)
                worker_token=uuid.uuid4(),
                status="active",  # Mark as active immediately
                created_at=created,
                updated_at=created,
            )

            db.add(environment)
            db.commit()
            db.refresh(environment)

            logger.info(
                "environment_created_for_worker",
                environment_name=registration.environment_name,
                environment_id=str(environment.id),
                org_id=org_id,
            )

        # Check if environment is ready
        if environment.status not in ["ready", "active"]:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail=f"Environment is not ready (status: {environment.status}). "
                       f"Please wait for provisioning to complete."
            )

        token = request.state.kubiya_token

        # Check if local Temporal (for development)
        if is_local_temporal():
            logger.info("using_local_temporal_config", org_id=org_id)
            temporal_credentials = {
                "namespace": os.getenv("TEMPORAL_NAMESPACE", "default"),
                "api_key": "",
                "host": os.getenv("TEMPORAL_HOST", "localhost:7233"),
                "org": org_id,
            }
        else:
            # Fetch org-specific credentials from Kubiya API
            try:
                temporal_credentials = await get_temporal_credentials_for_org(
                    org_id=org_id,
                    token=token,
                    use_fallback=True  # Enable fallback during migration
                )

                logger.info(
                    "temporal_credentials_fetched_for_worker",
                    org_id=org_id,
                    namespace=temporal_credentials["namespace"],
                    source="kubiya_api"
                )
            except Exception as e:
                logger.error(
                    "temporal_credentials_fetch_failed",
                    org_id=org_id,
                    error=str(e)
                )
                raise HTTPException(
                    status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                    detail="Failed to fetch Temporal credentials. Please contact support."
                )

        temporal_namespace = temporal_credentials["namespace"]
        # TODO: implement proper decryption; the key is passed through as-is
        temporal_api_key = temporal_credentials["api_key"]

        logger.info(
            "using_org_specific_namespace",
            namespace_name=temporal_namespace,
            org_id=org_id,
        )

        # Generate worker ID
        worker_id = uuid.uuid4()

        # Create worker record in database
        registered = datetime.now(timezone.utc)
        worker_heartbeat = WorkerHeartbeat(
            id=worker_id,
            worker_id=str(worker_id),  # Also set worker_id (has NOT NULL constraint)
            organization_id=org_id,
            environment_name=registration.environment_name,
            worker_token=environment.worker_token,
            hostname=registration.hostname,
            worker_metadata=registration.worker_metadata,
            status="active",
            tasks_processed=0,
            registered_at=registered,
            last_heartbeat=registered,
            updated_at=registered,
        )

        db.add(worker_heartbeat)
        db.commit()
        db.refresh(worker_heartbeat)

        # Format task queue name: org_id.environment_name
        task_queue_name = f"{org_id}.{registration.environment_name}"

        # TEMPORAL_HOST keeps precedence. When it is unset, use the host that
        # came with the resolved credentials (localhost:7233 for local
        # Temporal) before falling back to the Temporal Cloud default, so a
        # local setup is no longer handed the cloud endpoint.
        temporal_host = (
            os.getenv("TEMPORAL_HOST")
            or temporal_credentials.get("host")
            or "us-east-1.aws.api.temporal.io:7233"
        )

        # Get control plane URL from environment or construct from request
        control_plane_url = os.getenv("CONTROL_PLANE_URL")
        if not control_plane_url:
            control_plane_url = f"{request.url.scheme}://{request.url.netloc}"

        logger.info(
            "worker_registered",
            worker_id=str(worker_id),
            environment_name=registration.environment_name,
            task_queue=task_queue_name,
            org_id=org_id,
        )

        return WorkerRegistrationResponse(
            worker_id=str(worker_id),
            worker_token=str(environment.worker_token),
            environment_name=task_queue_name,  # Return formatted name
            temporal_namespace=temporal_namespace,
            temporal_host=temporal_host,
            temporal_api_key=temporal_api_key,
            organization_id=org_id,
            control_plane_url=control_plane_url,
        )

    except HTTPException:
        raise
    except Exception as e:
        logger.error(
            "worker_registration_failed",
            error=str(e),
            environment_name=registration.environment_name,
            org_id=organization["id"]
        )
        # Discard any half-finished transaction so the session is not left
        # in a failed state after a flush/commit error.
        db.rollback()
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to register worker: {str(e)}"
        ) from e
|
|
496
|
+
|
|
497
|
+
|
|
498
|
+
@router.post("/heartbeat", status_code=status.HTTP_204_NO_CONTENT)
@instrument_endpoint("workers.worker_heartbeat")
async def worker_heartbeat(
    heartbeat: WorkerHeartbeatRequest,
    request: Request,
    organization: dict = Depends(get_current_organization),
):
    """
    Record a heartbeat sent by a worker.

    The heartbeat is serialised to JSON and written to Redis under
    ``worker:{worker_id}:heartbeat`` with a 300-second expiry, so the entry
    disappears on its own when a worker stops reporting. Nothing is written
    to the database.

    The handler never fails the caller: when Redis is unavailable or the
    write raises, the problem is logged and the call still completes.

    Args:
        heartbeat: Worker ID, environment, status, task counters and metadata.
        request: Incoming request (not read by this handler).
        organization: Authenticated organization; only ``"id"`` is read.
    """
    try:
        org_id = organization["id"]
        redis_client = get_redis_client()

        if redis_client:
            # Snapshot of the worker state as stored in Redis
            serialized = json.dumps({
                "worker_id": heartbeat.worker_id,
                "organization_id": org_id,
                "environment_name": heartbeat.environment_name,
                "status": heartbeat.status,
                "tasks_processed": heartbeat.tasks_processed,
                "current_task_id": heartbeat.current_task_id,
                "last_heartbeat": datetime.now(timezone.utc).isoformat(),
                "metadata": heartbeat.worker_metadata,
            })

            # 5-minute TTL: a crashed worker's heartbeat simply expires
            await redis_client.set(
                f"worker:{heartbeat.worker_id}:heartbeat",
                serialized,
                ex=300,
            )

            logger.debug(
                "worker_heartbeat_received",
                worker_id=heartbeat.worker_id,
                status=heartbeat.status,
                environment_name=heartbeat.environment_name,
                org_id=org_id,
            )
        else:
            # Redis not available - warn and carry on (graceful degradation)
            logger.warning(
                "worker_heartbeat_redis_unavailable",
                worker_id=heartbeat.worker_id,
                org_id=org_id,
            )

        return None

    except Exception as e:
        logger.error(
            "worker_heartbeat_failed",
            error=str(e),
            worker_id=heartbeat.worker_id,
            org_id=organization["id"]
        )
        # A failed heartbeat must not fail the worker
        return None
|
|
565
|
+
|
|
566
|
+
|
|
567
|
+
# Worker ID-based endpoints (new architecture)
|
|
568
|
+
|
|
569
|
+
|
|
570
|
+
class WorkerStartRequest(BaseModel):
    """Body a worker sends when it starts and asks for its configuration."""

    # Free-form system details; start_worker merges them into the stored
    # worker metadata.
    system_info: dict[str, Any] = {}
|
|
573
|
+
|
|
574
|
+
|
|
575
|
+
class WorkerConfigResponse(BaseModel):
    """Configuration handed to a worker when it starts."""

    worker_id: str
    worker_queue_name: str
    environment_name: str
    # Full queue name: org.env.worker_queue
    task_queue_name: str
    temporal_namespace: str
    temporal_host: str
    temporal_api_key: str
    organization_id: str
    control_plane_url: str
    heartbeat_interval: int = 60

    # LiteLLM configuration
    litellm_api_url: str
    litellm_api_key: str

    # OpenTelemetry (OTEL) configuration for distributed tracing
    otel_enabled: bool = True
    otel_exporter_otlp_endpoint: str | None = None
    otel_service_name: str = "agent-control-plane-worker"
    otel_traces_sampler: str = "parentbased_always_on"
    otel_traces_sampler_arg: float | None = None
|
|
596
|
+
|
|
597
|
+
|
|
598
|
+
class WorkerSystemInfo(BaseModel):
    """Optional system details a worker may attach to a heartbeat.

    Every field defaults to None, so a worker can report any subset.
    """

    # Host and software versions
    hostname: str | None = None
    platform: str | None = None
    os_name: str | None = None
    os_version: str | None = None
    python_version: str | None = None
    cli_version: str | None = None
    # Worker SDK version
    sdk_version: str | None = None

    # Process details
    # Process ID
    pid: int | None = None
    # Current working directory
    cwd: str | None = None

    # Capabilities
    # Available runtimes (e.g., ["agno", "claude_code"])
    supported_runtimes: list[str] | None = None
    # LiteLLM/LLM gateway URL
    llm_gateway_url: str | None = None
    docker_available: bool | None = None
    docker_version: str | None = None

    # Resource usage (memory and disk sizes are in bytes)
    cpu_count: int | None = None
    cpu_percent: float | None = None
    memory_total: int | None = None
    memory_used: int | None = None
    memory_percent: float | None = None
    disk_total: int | None = None
    disk_used: int | None = None
    disk_percent: float | None = None
    uptime_seconds: float | None = None
|
|
622
|
+
|
|
623
|
+
|
|
624
|
+
class WorkerHeartbeatSimple(BaseModel):
    """Heartbeat body for the endpoint that takes worker_id from the URL."""

    status: str = "active"
    tasks_processed: int = 0
    current_task_id: str | None = None
    worker_metadata: dict[str, Any] = {}
    # Omitted on lightweight heartbeats; the endpoint then keeps the system
    # info of the previously stored heartbeat.
    system_info: WorkerSystemInfo | None = None
    # Recent log lines since last heartbeat
    logs: list[str] | None = None
|
|
632
|
+
|
|
633
|
+
|
|
634
|
+
@router.post("/{worker_id}/start", response_model=WorkerConfigResponse)
@instrument_endpoint("workers.start_worker")
async def start_worker(
    worker_id: str,
    start_request: WorkerStartRequest,
    request: Request,
    organization: dict = Depends(get_current_organization),
    db: Session = Depends(get_db),
):
    """
    Start a worker and fetch its configuration.

    This endpoint is called by workers on startup with just worker_id and API key.
    The worker row, its queue and (when set) the queue's environment are
    looked up within the caller's organization. The worker is marked
    "active", the reported system info is merged into its metadata, and the
    configuration needed to connect to Temporal is returned.

    Args:
        worker_id: Worker ID (UUID created in UI)
        start_request: System information from worker
        request: Incoming request; its URL is the fallback for the control
            plane URL when CONTROL_PLANE_URL is not set.
        organization: Authenticated organization; only ``"id"`` is read.
        db: Database session.

    Returns:
        Complete worker configuration including Temporal credentials,
        LiteLLM settings and OTEL settings.

    Raises:
        HTTPException: 404 when the worker or its queue is not found, 400
            when the worker has no queue assigned, 500 on any other failure.
    """
    import os

    try:
        org_id = organization["id"]

        # Look up worker in database with eager loading
        worker = db.query(WorkerHeartbeat).options(
            joinedload(WorkerHeartbeat.worker_queue).joinedload(WorkerQueue.environment)
        ).filter(
            WorkerHeartbeat.id == worker_id,
            WorkerHeartbeat.organization_id == org_id
        ).first()

        if not worker:
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail=f"Worker '{worker_id}' not found"
            )

        # Get worker queue separately
        if not worker.worker_queue_id:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail="Worker has no queue assigned"
            )

        worker_queue = db.query(WorkerQueue).filter(
            WorkerQueue.id == worker.worker_queue_id,
            WorkerQueue.organization_id == org_id
        ).first()

        if not worker_queue:
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail="Worker queue not found"
            )

        worker_queue_name = worker_queue.name

        # Get environment separately; "default" is used when the queue has
        # no environment or the environment row is missing
        environment_name = "default"
        if worker_queue.environment_id:
            environment = db.query(Environment).filter(
                Environment.id == worker_queue.environment_id,
                Environment.organization_id == org_id
            ).first()
            if environment:
                environment_name = environment.name

        # TEMPORARY: Skip database lookup and use fixed namespace + admin API key
        # NOTE(review): every worker receives the value of
        # TEMPORAL_CLOUD_ADMIN_TOKEN and a hard-coded namespace here, unlike
        # register_worker, which resolves org-specific credentials. Confirm
        # this is still intended before relying on it.
        temporal_namespace = "agent-control-plane.lpagu"
        temporal_api_key = os.getenv("TEMPORAL_CLOUD_ADMIN_TOKEN", "")

        logger.info(
            "using_fixed_namespace_for_testing",
            namespace_name=temporal_namespace,
            worker_id=worker_id,
            org_id=org_id,
        )

        # Update worker with system info and mark as active. A single
        # timestamp keeps last_start, last_heartbeat and updated_at aligned.
        started = datetime.now(timezone.utc)
        worker.worker_metadata = {
            **(worker.worker_metadata or {}),
            **start_request.system_info,
            "last_start": started.isoformat(),
        }
        worker.status = "active"
        worker.last_heartbeat = started
        worker.updated_at = started

        db.commit()
        db.refresh(worker)

        # Build full task queue name
        task_queue_name = f"{org_id}.{environment_name}.{worker_queue_name}"

        # Get Temporal Cloud configuration
        temporal_host = os.getenv("TEMPORAL_HOST", "us-east-1.aws.api.temporal.io:7233")

        # Get control plane URL
        control_plane_url = os.getenv("CONTROL_PLANE_URL")
        if not control_plane_url:
            control_plane_url = f"{request.url.scheme}://{request.url.netloc}"

        # Get LiteLLM configuration from environment
        litellm_api_url = os.getenv("LITELLM_API_URL", "https://api.openai.com/v1")
        litellm_api_key = os.getenv("LITELLM_API_KEY", "")

        logger.info(
            "worker_config_fetched",
            worker_id=worker_id,
            task_queue=task_queue_name,
            environment=environment_name,
            worker_queue=worker_queue_name,
            org_id=org_id,
        )

        # Get OTEL configuration from settings (centralized configuration)
        from control_plane_api.app.config import settings as app_settings

        return WorkerConfigResponse(
            worker_id=worker_id,
            worker_queue_name=worker_queue_name,
            environment_name=environment_name,
            task_queue_name=task_queue_name,
            temporal_namespace=temporal_namespace,
            temporal_host=temporal_host,
            temporal_api_key=temporal_api_key,
            organization_id=org_id,
            control_plane_url=control_plane_url,
            heartbeat_interval=worker_queue.heartbeat_interval or 60,
            litellm_api_url=litellm_api_url,
            litellm_api_key=litellm_api_key,
            # Pass OTEL configuration to worker (centralized config)
            otel_enabled=app_settings.OTEL_ENABLED,
            otel_exporter_otlp_endpoint=app_settings.OTEL_EXPORTER_OTLP_ENDPOINT,
            otel_service_name="agent-control-plane-worker",
            otel_traces_sampler=app_settings.OTEL_TRACES_SAMPLER,
            otel_traces_sampler_arg=app_settings.OTEL_TRACES_SAMPLER_ARG,
        )

    except HTTPException:
        raise
    except Exception as e:
        logger.error(
            "worker_start_failed",
            error=str(e),
            worker_id=worker_id,
            org_id=organization.get("id")
        )
        # Discard any half-finished transaction so the session is not left
        # in a failed state after a query/commit error.
        db.rollback()
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to start worker: {str(e)}"
        ) from e
|
|
797
|
+
|
|
798
|
+
|
|
799
|
+
@router.post("/{worker_id}/heartbeat", status_code=status.HTTP_204_NO_CONTENT)
@instrument_endpoint("workers.worker_heartbeat_simple")
async def worker_heartbeat_simple(
    worker_id: str,
    heartbeat: WorkerHeartbeatSimple,
    request: Request,
    organization: dict = Depends(get_current_organization),
):
    """
    Receive heartbeat from a worker (simplified version with worker_id in URL).

    The heartbeat is stored as JSON in Redis under
    ``worker:{worker_id}:heartbeat`` with a 300-second expiry; the database
    is not touched. The previously stored heartbeat is read first so that:

    - ``system_info`` is carried over when the new heartbeat omits it;
    - new ``logs`` are appended to the stored ones, keeping the last 100
      lines, and stored logs are kept when no new ones arrive.

    The stored heartbeat is only used for merging when it is a JSON object
    recorded for the caller's organization. The Redis key contains the
    worker ID only, so a record written under another organization must not
    leak its logs or system info into this one.

    The handler never fails the caller: errors are logged and swallowed.

    Args:
        worker_id: Worker ID (UUID)
        heartbeat: Heartbeat data
        request: Incoming request (not read by this handler).
        organization: Authenticated organization; only ``"id"`` is read.
    """
    try:
        org_id = organization["id"]
        redis_client = get_redis_client()

        if not redis_client:
            # Redis not available - log warning but don't fail (graceful degradation)
            logger.warning(
                "worker_heartbeat_redis_unavailable",
                worker_id=worker_id,
                org_id=org_id,
            )
            return None

        # Build heartbeat data for Redis
        heartbeat_data = {
            "worker_id": worker_id,
            "organization_id": org_id,
            "status": heartbeat.status,
            "tasks_processed": heartbeat.tasks_processed,
            "current_task_id": heartbeat.current_task_id,
            "last_heartbeat": datetime.now(timezone.utc).isoformat(),
            "metadata": heartbeat.worker_metadata,
        }

        # Get existing heartbeat data from Redis (for merging)
        redis_key = f"worker:{worker_id}:heartbeat"
        existing_heartbeat = None
        try:
            existing_data = await redis_client.get(redis_key)
            if existing_data:
                stored = json.loads(existing_data)
                if isinstance(stored, dict) and stored.get("organization_id") == org_id:
                    existing_heartbeat = stored
                else:
                    # Malformed payload or a record from another organization:
                    # treat it as absent instead of merging its contents.
                    logger.warning(
                        "heartbeat_existing_record_ignored",
                        worker_id=worker_id,
                        org_id=org_id,
                    )
        except Exception as e:
            logger.warning("heartbeat_redis_get_failed", error=str(e))

        # Handle system_info - preserve from last full heartbeat if not provided (lightweight mode)
        if heartbeat.system_info:
            # Full heartbeat - update system info
            heartbeat_data["system_info"] = heartbeat.system_info.dict(exclude_none=True)
        elif existing_heartbeat and "system_info" in existing_heartbeat:
            # Lightweight heartbeat - preserve existing system info
            heartbeat_data["system_info"] = existing_heartbeat["system_info"]

        # Handle logs - append new lines to the stored ones
        if heartbeat.logs:
            previous_logs = existing_heartbeat.get("logs") if existing_heartbeat else None
            if not isinstance(previous_logs, list):
                # Nothing stored (or an unexpected value): start from the new lines only
                previous_logs = []
            heartbeat_data["logs"] = (previous_logs + heartbeat.logs)[-100:]  # Keep last 100 lines
        elif existing_heartbeat and "logs" in existing_heartbeat:
            # Preserve existing logs if no new logs provided
            heartbeat_data["logs"] = existing_heartbeat["logs"]

        # Store in Redis with 5-minute TTL (if worker crashes, heartbeat expires)
        await redis_client.set(redis_key, json.dumps(heartbeat_data), ex=300)

        logger.debug(
            "worker_heartbeat_received",
            worker_id=worker_id,
            status=heartbeat.status,
            org_id=org_id,
        )

        return None

    except Exception as e:
        logger.error(
            "worker_heartbeat_failed",
            error=str(e),
            worker_id=worker_id,
            org_id=organization.get("id")
        )
        # Don't fail the worker if heartbeat fails - graceful degradation
        return None
|
|
898
|
+
|
|
899
|
+
|
|
900
|
+
class WorkerDisconnectRequest(BaseModel):
    """Worker disconnect request"""

    # Why the worker is going away, e.g. "shutdown", "error", "crash".
    # Falls back to "shutdown" when the caller does not send a reason.
    reason: str = "shutdown"

    # Exit code reported by the worker; omitted/None when not supplied.
    exit_code: Optional[int] = None

    # Error text reported by the worker; omitted/None when not supplied.
    error_message: Optional[str] = None
|
|
905
|
+
|
|
906
|
+
|
|
907
|
+
@router.post("/{worker_id}/disconnect", status_code=status.HTTP_204_NO_CONTENT)
@instrument_endpoint("workers.worker_disconnect")
async def worker_disconnect(
    worker_id: str,
    disconnect: WorkerDisconnectRequest,
    request: Request,
    organization: dict = Depends(get_current_organization),
    db: Session = Depends(get_db),
):
    """
    Mark a worker as disconnected/offline.

    This endpoint is called by workers when they:
    - Shut down gracefully (Ctrl+C)
    - Exit due to an error
    - Crash unexpectedly (via atexit handler)

    Args:
        worker_id: Worker ID (UUID)
        disconnect: Disconnect details (reason, exit code, error)
        request: Incoming request (not read directly in this body)
        organization: Current organization; its "id" scopes the lookup
        db: Database session used to load and update the worker row

    Raises:
        HTTPException: 404 when no worker with this ID exists for the
            organization; 500 when processing fails for any other reason.
    """
    try:
        org_id = organization["id"]

        # Look up worker in database, scoped to the caller's organization
        worker = db.query(WorkerHeartbeat).filter(
            WorkerHeartbeat.id == worker_id,
            WorkerHeartbeat.organization_id == org_id
        ).first()

        if not worker:
            logger.warning(
                "worker_disconnect_not_found",
                worker_id=worker_id,
                org_id=org_id,
            )
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail="Worker not found"
            )

        # IMPORTANT: Delete from Redis FIRST for immediate effect
        # This ensures workers are removed from active lists immediately
        redis_client = get_redis_client()
        if redis_client:
            redis_key = f"worker:{worker_id}:heartbeat"
            try:
                # Delete the heartbeat key from Redis
                await redis_client.delete(redis_key)
                logger.info(
                    "worker_removed_from_redis",
                    worker_id=worker_id,
                    redis_key=redis_key
                )
            except Exception as redis_error:
                # Log but don't fail the disconnect
                logger.warning(
                    "redis_delete_failed",
                    error=str(redis_error),
                    worker_id=worker_id
                )

        # THEN update worker status to disconnected in database.
        # Capture the time once so every timestamp written for this
        # disconnect event refers to the same instant.
        disconnected_at = datetime.now(timezone.utc)

        worker.status = "disconnected"
        worker.last_heartbeat = disconnected_at
        # NOTE(review): this replaces any existing worker_metadata rather
        # than merging into it — confirm that is intended.
        worker.worker_metadata = {
            "disconnect_reason": disconnect.reason,
            "disconnect_time": disconnected_at.isoformat(),
            "exit_code": disconnect.exit_code,
            "error_message": disconnect.error_message,
        }
        worker.updated_at = disconnected_at

        db.commit()

        logger.info(
            "worker_disconnected",
            worker_id=worker_id,
            reason=disconnect.reason,
            exit_code=disconnect.exit_code,
            org_id=org_id,
        )

        return None

    except HTTPException:
        # Deliberate HTTP errors (e.g. the 404 above) pass through untouched
        raise
    except Exception as e:
        # Discard any half-applied changes so the session is not left in a
        # failed transaction; a rollback failure must not mask the original
        # error, so it is only logged.
        try:
            db.rollback()
        except Exception as rollback_error:
            logger.warning(
                "worker_disconnect_rollback_failed",
                error=str(rollback_error),
                worker_id=worker_id,
            )

        logger.error(
            "worker_disconnect_failed",
            error=str(e),
            worker_id=worker_id,
            # org_id may be unbound if the failure was the "id" lookup itself
            org_id=organization.get("id")
        )
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to process disconnect: {str(e)}"
        ) from e
|