kubiya-control-plane-api 0.9.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (479)
  1. control_plane_api/LICENSE +676 -0
  2. control_plane_api/README.md +350 -0
  3. control_plane_api/__init__.py +4 -0
  4. control_plane_api/__version__.py +8 -0
  5. control_plane_api/alembic/README +1 -0
  6. control_plane_api/alembic/env.py +121 -0
  7. control_plane_api/alembic/script.py.mako +28 -0
  8. control_plane_api/alembic/versions/2613c65c3dbe_initial_database_setup.py +32 -0
  9. control_plane_api/alembic/versions/2df520d4927d_merge_heads.py +28 -0
  10. control_plane_api/alembic/versions/43abf98d6a01_add_paused_status_to_executions.py +73 -0
  11. control_plane_api/alembic/versions/6289854264cb_merge_multiple_heads.py +28 -0
  12. control_plane_api/alembic/versions/6a4d4dc3d8dc_generate_execution_transitions.py +50 -0
  13. control_plane_api/alembic/versions/87d11cf0a783_add_disconnected_status_to_worker_.py +44 -0
  14. control_plane_api/alembic/versions/add_ephemeral_queue_support.py +85 -0
  15. control_plane_api/alembic/versions/add_model_type_to_llm_models.py +31 -0
  16. control_plane_api/alembic/versions/add_plan_executions_table.py +114 -0
  17. control_plane_api/alembic/versions/add_trace_span_tables.py +154 -0
  18. control_plane_api/alembic/versions/add_user_info_to_traces.py +36 -0
  19. control_plane_api/alembic/versions/adjusting_foreign_keys.py +32 -0
  20. control_plane_api/alembic/versions/b4983d976db2_initial_tables.py +1128 -0
  21. control_plane_api/alembic/versions/d181a3b40e71_rename_custom_metadata_to_metadata_in_.py +50 -0
  22. control_plane_api/alembic/versions/df9117888e82_add_missing_columns.py +82 -0
  23. control_plane_api/alembic/versions/f25de6ad895a_missing_migrations.py +34 -0
  24. control_plane_api/alembic/versions/f71305fb69b9_fix_ephemeral_queue_deletion_foreign_key.py +54 -0
  25. control_plane_api/alembic/versions/mark_local_exec_queues_as_ephemeral.py +68 -0
  26. control_plane_api/alembic.ini +148 -0
  27. control_plane_api/api/index.py +12 -0
  28. control_plane_api/app/__init__.py +11 -0
  29. control_plane_api/app/activities/__init__.py +20 -0
  30. control_plane_api/app/activities/agent_activities.py +384 -0
  31. control_plane_api/app/activities/plan_generation_activities.py +499 -0
  32. control_plane_api/app/activities/team_activities.py +424 -0
  33. control_plane_api/app/activities/temporal_cloud_activities.py +588 -0
  34. control_plane_api/app/config/__init__.py +35 -0
  35. control_plane_api/app/config/api_config.py +469 -0
  36. control_plane_api/app/config/config_loader.py +224 -0
  37. control_plane_api/app/config/model_pricing.py +323 -0
  38. control_plane_api/app/config/storage_config.py +159 -0
  39. control_plane_api/app/config.py +115 -0
  40. control_plane_api/app/controllers/__init__.py +0 -0
  41. control_plane_api/app/controllers/execution_environment_controller.py +1315 -0
  42. control_plane_api/app/database.py +135 -0
  43. control_plane_api/app/exceptions.py +408 -0
  44. control_plane_api/app/lib/__init__.py +11 -0
  45. control_plane_api/app/lib/environment.py +65 -0
  46. control_plane_api/app/lib/event_bus/__init__.py +17 -0
  47. control_plane_api/app/lib/event_bus/base.py +136 -0
  48. control_plane_api/app/lib/event_bus/manager.py +335 -0
  49. control_plane_api/app/lib/event_bus/providers/__init__.py +6 -0
  50. control_plane_api/app/lib/event_bus/providers/http_provider.py +166 -0
  51. control_plane_api/app/lib/event_bus/providers/nats_provider.py +324 -0
  52. control_plane_api/app/lib/event_bus/providers/redis_provider.py +233 -0
  53. control_plane_api/app/lib/event_bus/providers/websocket_provider.py +497 -0
  54. control_plane_api/app/lib/job_executor.py +330 -0
  55. control_plane_api/app/lib/kubiya_client.py +293 -0
  56. control_plane_api/app/lib/litellm_pricing.py +166 -0
  57. control_plane_api/app/lib/mcp_validation.py +163 -0
  58. control_plane_api/app/lib/nats/__init__.py +13 -0
  59. control_plane_api/app/lib/nats/credentials_manager.py +288 -0
  60. control_plane_api/app/lib/nats/listener.py +374 -0
  61. control_plane_api/app/lib/planning_prompt_builder.py +153 -0
  62. control_plane_api/app/lib/planning_tools/__init__.py +41 -0
  63. control_plane_api/app/lib/planning_tools/agents.py +409 -0
  64. control_plane_api/app/lib/planning_tools/agno_toolkit.py +836 -0
  65. control_plane_api/app/lib/planning_tools/base.py +119 -0
  66. control_plane_api/app/lib/planning_tools/cognitive_memory_tools.py +403 -0
  67. control_plane_api/app/lib/planning_tools/context_graph_tools.py +545 -0
  68. control_plane_api/app/lib/planning_tools/environments.py +218 -0
  69. control_plane_api/app/lib/planning_tools/knowledge.py +204 -0
  70. control_plane_api/app/lib/planning_tools/models.py +93 -0
  71. control_plane_api/app/lib/planning_tools/planning_service.py +646 -0
  72. control_plane_api/app/lib/planning_tools/resources.py +242 -0
  73. control_plane_api/app/lib/planning_tools/teams.py +334 -0
  74. control_plane_api/app/lib/policy_enforcer_client.py +1016 -0
  75. control_plane_api/app/lib/redis_client.py +803 -0
  76. control_plane_api/app/lib/sqlalchemy_utils.py +486 -0
  77. control_plane_api/app/lib/state_transition_tools/__init__.py +7 -0
  78. control_plane_api/app/lib/state_transition_tools/execution_context.py +388 -0
  79. control_plane_api/app/lib/storage/__init__.py +20 -0
  80. control_plane_api/app/lib/storage/base_provider.py +274 -0
  81. control_plane_api/app/lib/storage/provider_factory.py +157 -0
  82. control_plane_api/app/lib/storage/vercel_blob_provider.py +468 -0
  83. control_plane_api/app/lib/supabase.py +71 -0
  84. control_plane_api/app/lib/supabase_utils.py +138 -0
  85. control_plane_api/app/lib/task_planning/__init__.py +138 -0
  86. control_plane_api/app/lib/task_planning/agent_factory.py +308 -0
  87. control_plane_api/app/lib/task_planning/agents.py +389 -0
  88. control_plane_api/app/lib/task_planning/cache.py +218 -0
  89. control_plane_api/app/lib/task_planning/entity_resolver.py +273 -0
  90. control_plane_api/app/lib/task_planning/helpers.py +293 -0
  91. control_plane_api/app/lib/task_planning/hooks.py +474 -0
  92. control_plane_api/app/lib/task_planning/models.py +503 -0
  93. control_plane_api/app/lib/task_planning/plan_validator.py +166 -0
  94. control_plane_api/app/lib/task_planning/planning_workflow.py +2911 -0
  95. control_plane_api/app/lib/task_planning/runner.py +656 -0
  96. control_plane_api/app/lib/task_planning/streaming_hook.py +213 -0
  97. control_plane_api/app/lib/task_planning/workflow.py +424 -0
  98. control_plane_api/app/lib/templating/__init__.py +88 -0
  99. control_plane_api/app/lib/templating/compiler.py +278 -0
  100. control_plane_api/app/lib/templating/engine.py +178 -0
  101. control_plane_api/app/lib/templating/parsers/__init__.py +29 -0
  102. control_plane_api/app/lib/templating/parsers/base.py +96 -0
  103. control_plane_api/app/lib/templating/parsers/env.py +85 -0
  104. control_plane_api/app/lib/templating/parsers/graph.py +112 -0
  105. control_plane_api/app/lib/templating/parsers/secret.py +87 -0
  106. control_plane_api/app/lib/templating/parsers/simple.py +81 -0
  107. control_plane_api/app/lib/templating/resolver.py +366 -0
  108. control_plane_api/app/lib/templating/types.py +214 -0
  109. control_plane_api/app/lib/templating/validator.py +201 -0
  110. control_plane_api/app/lib/temporal_client.py +232 -0
  111. control_plane_api/app/lib/temporal_credentials_cache.py +178 -0
  112. control_plane_api/app/lib/temporal_credentials_service.py +203 -0
  113. control_plane_api/app/lib/validation/__init__.py +24 -0
  114. control_plane_api/app/lib/validation/runtime_validation.py +388 -0
  115. control_plane_api/app/main.py +531 -0
  116. control_plane_api/app/middleware/__init__.py +10 -0
  117. control_plane_api/app/middleware/auth.py +645 -0
  118. control_plane_api/app/middleware/exception_handler.py +267 -0
  119. control_plane_api/app/middleware/prometheus_middleware.py +173 -0
  120. control_plane_api/app/middleware/rate_limiting.py +384 -0
  121. control_plane_api/app/middleware/request_id.py +202 -0
  122. control_plane_api/app/models/__init__.py +40 -0
  123. control_plane_api/app/models/agent.py +90 -0
  124. control_plane_api/app/models/analytics.py +206 -0
  125. control_plane_api/app/models/associations.py +107 -0
  126. control_plane_api/app/models/auth_user.py +73 -0
  127. control_plane_api/app/models/context.py +161 -0
  128. control_plane_api/app/models/custom_integration.py +99 -0
  129. control_plane_api/app/models/environment.py +64 -0
  130. control_plane_api/app/models/execution.py +125 -0
  131. control_plane_api/app/models/execution_transition.py +50 -0
  132. control_plane_api/app/models/job.py +159 -0
  133. control_plane_api/app/models/llm_model.py +78 -0
  134. control_plane_api/app/models/orchestration.py +66 -0
  135. control_plane_api/app/models/plan_execution.py +102 -0
  136. control_plane_api/app/models/presence.py +49 -0
  137. control_plane_api/app/models/project.py +61 -0
  138. control_plane_api/app/models/project_management.py +85 -0
  139. control_plane_api/app/models/session.py +29 -0
  140. control_plane_api/app/models/skill.py +155 -0
  141. control_plane_api/app/models/system_tables.py +43 -0
  142. control_plane_api/app/models/task_planning.py +372 -0
  143. control_plane_api/app/models/team.py +86 -0
  144. control_plane_api/app/models/trace.py +257 -0
  145. control_plane_api/app/models/user_profile.py +54 -0
  146. control_plane_api/app/models/worker.py +221 -0
  147. control_plane_api/app/models/workflow.py +161 -0
  148. control_plane_api/app/models/workspace.py +50 -0
  149. control_plane_api/app/observability/__init__.py +177 -0
  150. control_plane_api/app/observability/context_logging.py +475 -0
  151. control_plane_api/app/observability/decorators.py +337 -0
  152. control_plane_api/app/observability/local_span_processor.py +702 -0
  153. control_plane_api/app/observability/metrics.py +303 -0
  154. control_plane_api/app/observability/middleware.py +246 -0
  155. control_plane_api/app/observability/optional.py +115 -0
  156. control_plane_api/app/observability/tracing.py +382 -0
  157. control_plane_api/app/policies/README.md +149 -0
  158. control_plane_api/app/policies/approved_users.rego +62 -0
  159. control_plane_api/app/policies/business_hours.rego +51 -0
  160. control_plane_api/app/policies/rate_limiting.rego +100 -0
  161. control_plane_api/app/policies/tool_enforcement/README.md +336 -0
  162. control_plane_api/app/policies/tool_enforcement/bash_command_validation.rego +71 -0
  163. control_plane_api/app/policies/tool_enforcement/business_hours_enforcement.rego +82 -0
  164. control_plane_api/app/policies/tool_enforcement/mcp_tool_allowlist.rego +58 -0
  165. control_plane_api/app/policies/tool_enforcement/production_safeguards.rego +80 -0
  166. control_plane_api/app/policies/tool_enforcement/role_based_tool_access.rego +44 -0
  167. control_plane_api/app/policies/tool_restrictions.rego +86 -0
  168. control_plane_api/app/routers/__init__.py +4 -0
  169. control_plane_api/app/routers/agents.py +382 -0
  170. control_plane_api/app/routers/agents_v2.py +1598 -0
  171. control_plane_api/app/routers/analytics.py +1310 -0
  172. control_plane_api/app/routers/auth.py +59 -0
  173. control_plane_api/app/routers/client_config.py +57 -0
  174. control_plane_api/app/routers/context_graph.py +561 -0
  175. control_plane_api/app/routers/context_manager.py +577 -0
  176. control_plane_api/app/routers/custom_integrations.py +490 -0
  177. control_plane_api/app/routers/enforcer.py +132 -0
  178. control_plane_api/app/routers/environment_context.py +252 -0
  179. control_plane_api/app/routers/environments.py +761 -0
  180. control_plane_api/app/routers/execution_environment.py +847 -0
  181. control_plane_api/app/routers/executions/__init__.py +28 -0
  182. control_plane_api/app/routers/executions/router.py +286 -0
  183. control_plane_api/app/routers/executions/services/__init__.py +22 -0
  184. control_plane_api/app/routers/executions/services/demo_worker_health.py +156 -0
  185. control_plane_api/app/routers/executions/services/status_service.py +420 -0
  186. control_plane_api/app/routers/executions/services/test_worker_health.py +480 -0
  187. control_plane_api/app/routers/executions/services/worker_health.py +514 -0
  188. control_plane_api/app/routers/executions/streaming/__init__.py +22 -0
  189. control_plane_api/app/routers/executions/streaming/deduplication.py +352 -0
  190. control_plane_api/app/routers/executions/streaming/event_buffer.py +353 -0
  191. control_plane_api/app/routers/executions/streaming/event_formatter.py +964 -0
  192. control_plane_api/app/routers/executions/streaming/history_loader.py +588 -0
  193. control_plane_api/app/routers/executions/streaming/live_source.py +693 -0
  194. control_plane_api/app/routers/executions/streaming/streamer.py +849 -0
  195. control_plane_api/app/routers/executions.py +4888 -0
  196. control_plane_api/app/routers/health.py +165 -0
  197. control_plane_api/app/routers/health_v2.py +394 -0
  198. control_plane_api/app/routers/integration_templates.py +496 -0
  199. control_plane_api/app/routers/integrations.py +287 -0
  200. control_plane_api/app/routers/jobs.py +1809 -0
  201. control_plane_api/app/routers/metrics.py +517 -0
  202. control_plane_api/app/routers/models.py +82 -0
  203. control_plane_api/app/routers/models_v2.py +628 -0
  204. control_plane_api/app/routers/plan_executions.py +1481 -0
  205. control_plane_api/app/routers/plan_generation_async.py +304 -0
  206. control_plane_api/app/routers/policies.py +669 -0
  207. control_plane_api/app/routers/presence.py +234 -0
  208. control_plane_api/app/routers/projects.py +987 -0
  209. control_plane_api/app/routers/runners.py +379 -0
  210. control_plane_api/app/routers/runtimes.py +172 -0
  211. control_plane_api/app/routers/secrets.py +171 -0
  212. control_plane_api/app/routers/skills.py +1010 -0
  213. control_plane_api/app/routers/skills_definitions.py +140 -0
  214. control_plane_api/app/routers/storage.py +456 -0
  215. control_plane_api/app/routers/task_planning.py +611 -0
  216. control_plane_api/app/routers/task_queues.py +650 -0
  217. control_plane_api/app/routers/team_context.py +274 -0
  218. control_plane_api/app/routers/teams.py +1747 -0
  219. control_plane_api/app/routers/templates.py +248 -0
  220. control_plane_api/app/routers/traces.py +571 -0
  221. control_plane_api/app/routers/websocket_client.py +479 -0
  222. control_plane_api/app/routers/websocket_executions_status.py +437 -0
  223. control_plane_api/app/routers/websocket_gateway.py +323 -0
  224. control_plane_api/app/routers/websocket_traces.py +576 -0
  225. control_plane_api/app/routers/worker_queues.py +2555 -0
  226. control_plane_api/app/routers/worker_websocket.py +419 -0
  227. control_plane_api/app/routers/workers.py +1004 -0
  228. control_plane_api/app/routers/workflows.py +204 -0
  229. control_plane_api/app/runtimes/__init__.py +6 -0
  230. control_plane_api/app/runtimes/validation.py +344 -0
  231. control_plane_api/app/schemas/__init__.py +1 -0
  232. control_plane_api/app/schemas/job_schemas.py +302 -0
  233. control_plane_api/app/schemas/mcp_schemas.py +311 -0
  234. control_plane_api/app/schemas/template_schemas.py +133 -0
  235. control_plane_api/app/schemas/trace_schemas.py +168 -0
  236. control_plane_api/app/schemas/worker_queue_observability_schemas.py +165 -0
  237. control_plane_api/app/services/__init__.py +1 -0
  238. control_plane_api/app/services/agno_planning_strategy.py +233 -0
  239. control_plane_api/app/services/agno_service.py +838 -0
  240. control_plane_api/app/services/claude_code_planning_service.py +203 -0
  241. control_plane_api/app/services/context_graph_client.py +224 -0
  242. control_plane_api/app/services/custom_integration_service.py +415 -0
  243. control_plane_api/app/services/integration_resolution_service.py +345 -0
  244. control_plane_api/app/services/litellm_service.py +394 -0
  245. control_plane_api/app/services/plan_generator.py +79 -0
  246. control_plane_api/app/services/planning_strategy.py +66 -0
  247. control_plane_api/app/services/planning_strategy_factory.py +118 -0
  248. control_plane_api/app/services/policy_service.py +615 -0
  249. control_plane_api/app/services/state_transition_service.py +755 -0
  250. control_plane_api/app/services/storage_service.py +593 -0
  251. control_plane_api/app/services/temporal_cloud_provisioning.py +150 -0
  252. control_plane_api/app/services/toolsets/context_graph_skill.py +432 -0
  253. control_plane_api/app/services/trace_retention.py +354 -0
  254. control_plane_api/app/services/worker_queue_metrics_service.py +190 -0
  255. control_plane_api/app/services/workflow_cancellation_manager.py +135 -0
  256. control_plane_api/app/services/workflow_operations_service.py +611 -0
  257. control_plane_api/app/skills/__init__.py +100 -0
  258. control_plane_api/app/skills/base.py +239 -0
  259. control_plane_api/app/skills/builtin/__init__.py +37 -0
  260. control_plane_api/app/skills/builtin/agent_communication/__init__.py +8 -0
  261. control_plane_api/app/skills/builtin/agent_communication/skill.py +246 -0
  262. control_plane_api/app/skills/builtin/code_ingestion/__init__.py +4 -0
  263. control_plane_api/app/skills/builtin/code_ingestion/skill.py +267 -0
  264. control_plane_api/app/skills/builtin/cognitive_memory/__init__.py +4 -0
  265. control_plane_api/app/skills/builtin/cognitive_memory/skill.py +174 -0
  266. control_plane_api/app/skills/builtin/contextual_awareness/__init__.py +4 -0
  267. control_plane_api/app/skills/builtin/contextual_awareness/skill.py +387 -0
  268. control_plane_api/app/skills/builtin/data_visualization/__init__.py +4 -0
  269. control_plane_api/app/skills/builtin/data_visualization/skill.py +154 -0
  270. control_plane_api/app/skills/builtin/docker/__init__.py +4 -0
  271. control_plane_api/app/skills/builtin/docker/skill.py +104 -0
  272. control_plane_api/app/skills/builtin/file_generation/__init__.py +4 -0
  273. control_plane_api/app/skills/builtin/file_generation/skill.py +94 -0
  274. control_plane_api/app/skills/builtin/file_system/__init__.py +4 -0
  275. control_plane_api/app/skills/builtin/file_system/skill.py +110 -0
  276. control_plane_api/app/skills/builtin/knowledge_api/__init__.py +5 -0
  277. control_plane_api/app/skills/builtin/knowledge_api/skill.py +124 -0
  278. control_plane_api/app/skills/builtin/python/__init__.py +4 -0
  279. control_plane_api/app/skills/builtin/python/skill.py +92 -0
  280. control_plane_api/app/skills/builtin/remote_filesystem/__init__.py +5 -0
  281. control_plane_api/app/skills/builtin/remote_filesystem/skill.py +170 -0
  282. control_plane_api/app/skills/builtin/shell/__init__.py +4 -0
  283. control_plane_api/app/skills/builtin/shell/skill.py +161 -0
  284. control_plane_api/app/skills/builtin/slack/__init__.py +3 -0
  285. control_plane_api/app/skills/builtin/slack/skill.py +302 -0
  286. control_plane_api/app/skills/builtin/workflow_executor/__init__.py +4 -0
  287. control_plane_api/app/skills/builtin/workflow_executor/skill.py +469 -0
  288. control_plane_api/app/skills/business_intelligence.py +189 -0
  289. control_plane_api/app/skills/config.py +63 -0
  290. control_plane_api/app/skills/loaders/__init__.py +14 -0
  291. control_plane_api/app/skills/loaders/base.py +73 -0
  292. control_plane_api/app/skills/loaders/filesystem_loader.py +199 -0
  293. control_plane_api/app/skills/registry.py +125 -0
  294. control_plane_api/app/utils/helpers.py +12 -0
  295. control_plane_api/app/utils/workflow_executor.py +354 -0
  296. control_plane_api/app/workflows/__init__.py +11 -0
  297. control_plane_api/app/workflows/agent_execution.py +520 -0
  298. control_plane_api/app/workflows/agent_execution_with_skills.py +223 -0
  299. control_plane_api/app/workflows/namespace_provisioning.py +326 -0
  300. control_plane_api/app/workflows/plan_generation.py +254 -0
  301. control_plane_api/app/workflows/team_execution.py +442 -0
  302. control_plane_api/scripts/seed_models.py +240 -0
  303. control_plane_api/scripts/validate_existing_tool_names.py +492 -0
  304. control_plane_api/shared/__init__.py +8 -0
  305. control_plane_api/shared/version.py +17 -0
  306. control_plane_api/test_deduplication.py +274 -0
  307. control_plane_api/test_executor_deduplication_e2e.py +309 -0
  308. control_plane_api/test_job_execution_e2e.py +283 -0
  309. control_plane_api/test_real_integration.py +193 -0
  310. control_plane_api/version.py +38 -0
  311. control_plane_api/worker/__init__.py +0 -0
  312. control_plane_api/worker/activities/__init__.py +0 -0
  313. control_plane_api/worker/activities/agent_activities.py +1585 -0
  314. control_plane_api/worker/activities/approval_activities.py +234 -0
  315. control_plane_api/worker/activities/job_activities.py +199 -0
  316. control_plane_api/worker/activities/runtime_activities.py +1167 -0
  317. control_plane_api/worker/activities/skill_activities.py +282 -0
  318. control_plane_api/worker/activities/team_activities.py +479 -0
  319. control_plane_api/worker/agent_runtime_server.py +370 -0
  320. control_plane_api/worker/binary_manager.py +333 -0
  321. control_plane_api/worker/config/__init__.py +31 -0
  322. control_plane_api/worker/config/worker_config.py +273 -0
  323. control_plane_api/worker/control_plane_client.py +1491 -0
  324. control_plane_api/worker/examples/analytics_integration_example.py +362 -0
  325. control_plane_api/worker/health_monitor.py +159 -0
  326. control_plane_api/worker/metrics.py +237 -0
  327. control_plane_api/worker/models/__init__.py +1 -0
  328. control_plane_api/worker/models/error_events.py +105 -0
  329. control_plane_api/worker/models/inputs.py +89 -0
  330. control_plane_api/worker/runtimes/__init__.py +35 -0
  331. control_plane_api/worker/runtimes/agent_runtime/runtime.py +485 -0
  332. control_plane_api/worker/runtimes/agno/__init__.py +34 -0
  333. control_plane_api/worker/runtimes/agno/config.py +248 -0
  334. control_plane_api/worker/runtimes/agno/hooks.py +385 -0
  335. control_plane_api/worker/runtimes/agno/mcp_builder.py +195 -0
  336. control_plane_api/worker/runtimes/agno/runtime.py +1063 -0
  337. control_plane_api/worker/runtimes/agno/utils.py +163 -0
  338. control_plane_api/worker/runtimes/base.py +979 -0
  339. control_plane_api/worker/runtimes/claude_code/__init__.py +38 -0
  340. control_plane_api/worker/runtimes/claude_code/cleanup.py +184 -0
  341. control_plane_api/worker/runtimes/claude_code/client_pool.py +529 -0
  342. control_plane_api/worker/runtimes/claude_code/config.py +829 -0
  343. control_plane_api/worker/runtimes/claude_code/hooks.py +482 -0
  344. control_plane_api/worker/runtimes/claude_code/litellm_proxy.py +1702 -0
  345. control_plane_api/worker/runtimes/claude_code/mcp_builder.py +467 -0
  346. control_plane_api/worker/runtimes/claude_code/mcp_discovery.py +558 -0
  347. control_plane_api/worker/runtimes/claude_code/runtime.py +1546 -0
  348. control_plane_api/worker/runtimes/claude_code/tool_mapper.py +403 -0
  349. control_plane_api/worker/runtimes/claude_code/utils.py +149 -0
  350. control_plane_api/worker/runtimes/factory.py +173 -0
  351. control_plane_api/worker/runtimes/model_utils.py +107 -0
  352. control_plane_api/worker/runtimes/validation.py +93 -0
  353. control_plane_api/worker/services/__init__.py +1 -0
  354. control_plane_api/worker/services/agent_communication_tools.py +908 -0
  355. control_plane_api/worker/services/agent_executor.py +485 -0
  356. control_plane_api/worker/services/agent_executor_v2.py +793 -0
  357. control_plane_api/worker/services/analytics_collector.py +457 -0
  358. control_plane_api/worker/services/analytics_service.py +464 -0
  359. control_plane_api/worker/services/approval_tools.py +310 -0
  360. control_plane_api/worker/services/approval_tools_agno.py +207 -0
  361. control_plane_api/worker/services/cancellation_manager.py +177 -0
  362. control_plane_api/worker/services/code_ingestion_tools.py +465 -0
  363. control_plane_api/worker/services/contextual_awareness_tools.py +405 -0
  364. control_plane_api/worker/services/data_visualization.py +834 -0
  365. control_plane_api/worker/services/event_publisher.py +531 -0
  366. control_plane_api/worker/services/jira_tools.py +257 -0
  367. control_plane_api/worker/services/remote_filesystem_tools.py +498 -0
  368. control_plane_api/worker/services/runtime_analytics.py +328 -0
  369. control_plane_api/worker/services/session_service.py +365 -0
  370. control_plane_api/worker/services/skill_context_enhancement.py +181 -0
  371. control_plane_api/worker/services/skill_factory.py +471 -0
  372. control_plane_api/worker/services/system_prompt_enhancement.py +410 -0
  373. control_plane_api/worker/services/team_executor.py +715 -0
  374. control_plane_api/worker/services/team_executor_v2.py +1866 -0
  375. control_plane_api/worker/services/tool_enforcement.py +254 -0
  376. control_plane_api/worker/services/workflow_executor/__init__.py +52 -0
  377. control_plane_api/worker/services/workflow_executor/event_processor.py +287 -0
  378. control_plane_api/worker/services/workflow_executor/event_publisher.py +210 -0
  379. control_plane_api/worker/services/workflow_executor/executors/__init__.py +15 -0
  380. control_plane_api/worker/services/workflow_executor/executors/base.py +270 -0
  381. control_plane_api/worker/services/workflow_executor/executors/json_executor.py +50 -0
  382. control_plane_api/worker/services/workflow_executor/executors/python_executor.py +50 -0
  383. control_plane_api/worker/services/workflow_executor/models.py +142 -0
  384. control_plane_api/worker/services/workflow_executor_tools.py +1748 -0
  385. control_plane_api/worker/skills/__init__.py +12 -0
  386. control_plane_api/worker/skills/builtin/context_graph_search/README.md +213 -0
  387. control_plane_api/worker/skills/builtin/context_graph_search/__init__.py +5 -0
  388. control_plane_api/worker/skills/builtin/context_graph_search/agno_impl.py +808 -0
  389. control_plane_api/worker/skills/builtin/context_graph_search/skill.yaml +67 -0
  390. control_plane_api/worker/skills/builtin/contextual_awareness/__init__.py +4 -0
  391. control_plane_api/worker/skills/builtin/contextual_awareness/agno_impl.py +62 -0
  392. control_plane_api/worker/skills/builtin/data_visualization/agno_impl.py +18 -0
  393. control_plane_api/worker/skills/builtin/data_visualization/skill.yaml +84 -0
  394. control_plane_api/worker/skills/builtin/docker/agno_impl.py +65 -0
  395. control_plane_api/worker/skills/builtin/docker/skill.yaml +60 -0
  396. control_plane_api/worker/skills/builtin/file_generation/agno_impl.py +47 -0
  397. control_plane_api/worker/skills/builtin/file_generation/skill.yaml +64 -0
  398. control_plane_api/worker/skills/builtin/file_system/agno_impl.py +32 -0
  399. control_plane_api/worker/skills/builtin/file_system/skill.yaml +54 -0
  400. control_plane_api/worker/skills/builtin/knowledge_api/__init__.py +4 -0
  401. control_plane_api/worker/skills/builtin/knowledge_api/agno_impl.py +50 -0
  402. control_plane_api/worker/skills/builtin/knowledge_api/skill.yaml +66 -0
  403. control_plane_api/worker/skills/builtin/python/agno_impl.py +25 -0
  404. control_plane_api/worker/skills/builtin/python/skill.yaml +60 -0
  405. control_plane_api/worker/skills/builtin/schema_fix_mixin.py +260 -0
  406. control_plane_api/worker/skills/builtin/shell/agno_impl.py +31 -0
  407. control_plane_api/worker/skills/builtin/shell/skill.yaml +60 -0
  408. control_plane_api/worker/skills/builtin/slack/__init__.py +3 -0
  409. control_plane_api/worker/skills/builtin/slack/agno_impl.py +1282 -0
  410. control_plane_api/worker/skills/builtin/slack/skill.yaml +276 -0
  411. control_plane_api/worker/skills/builtin/workflow_executor/agno_impl.py +62 -0
  412. control_plane_api/worker/skills/builtin/workflow_executor/skill.yaml +79 -0
  413. control_plane_api/worker/skills/loaders/__init__.py +5 -0
  414. control_plane_api/worker/skills/loaders/base.py +23 -0
  415. control_plane_api/worker/skills/loaders/filesystem_loader.py +357 -0
  416. control_plane_api/worker/skills/registry.py +208 -0
  417. control_plane_api/worker/tests/__init__.py +1 -0
  418. control_plane_api/worker/tests/conftest.py +12 -0
  419. control_plane_api/worker/tests/e2e/__init__.py +0 -0
  420. control_plane_api/worker/tests/e2e/test_context_graph_real_api.py +338 -0
  421. control_plane_api/worker/tests/e2e/test_context_graph_templates_e2e.py +523 -0
  422. control_plane_api/worker/tests/e2e/test_enforcement_e2e.py +344 -0
  423. control_plane_api/worker/tests/e2e/test_execution_flow.py +571 -0
  424. control_plane_api/worker/tests/e2e/test_single_execution_mode.py +656 -0
  425. control_plane_api/worker/tests/integration/__init__.py +0 -0
  426. control_plane_api/worker/tests/integration/test_builtin_skills_fixes.py +245 -0
  427. control_plane_api/worker/tests/integration/test_context_graph_search_integration.py +365 -0
  428. control_plane_api/worker/tests/integration/test_control_plane_integration.py +308 -0
  429. control_plane_api/worker/tests/integration/test_hook_enforcement_integration.py +579 -0
  430. control_plane_api/worker/tests/integration/test_scheduled_job_workflow.py +237 -0
  431. control_plane_api/worker/tests/integration/test_system_prompt_enhancement_integration.py +343 -0
  432. control_plane_api/worker/tests/unit/__init__.py +0 -0
  433. control_plane_api/worker/tests/unit/test_builtin_skill_autoload.py +396 -0
  434. control_plane_api/worker/tests/unit/test_context_graph_search.py +450 -0
  435. control_plane_api/worker/tests/unit/test_context_graph_templates.py +403 -0
  436. control_plane_api/worker/tests/unit/test_control_plane_client.py +401 -0
  437. control_plane_api/worker/tests/unit/test_control_plane_client_jobs.py +345 -0
  438. control_plane_api/worker/tests/unit/test_job_activities.py +353 -0
  439. control_plane_api/worker/tests/unit/test_skill_context_enhancement.py +321 -0
  440. control_plane_api/worker/tests/unit/test_system_prompt_enhancement.py +415 -0
  441. control_plane_api/worker/tests/unit/test_tool_enforcement.py +324 -0
  442. control_plane_api/worker/utils/__init__.py +1 -0
  443. control_plane_api/worker/utils/chunk_batcher.py +330 -0
  444. control_plane_api/worker/utils/environment.py +65 -0
  445. control_plane_api/worker/utils/error_publisher.py +260 -0
  446. control_plane_api/worker/utils/event_batcher.py +256 -0
  447. control_plane_api/worker/utils/logging_config.py +335 -0
  448. control_plane_api/worker/utils/logging_helper.py +326 -0
  449. control_plane_api/worker/utils/parameter_validator.py +120 -0
  450. control_plane_api/worker/utils/retry_utils.py +60 -0
  451. control_plane_api/worker/utils/streaming_utils.py +665 -0
  452. control_plane_api/worker/utils/tool_validation.py +332 -0
  453. control_plane_api/worker/utils/workspace_manager.py +163 -0
  454. control_plane_api/worker/websocket_client.py +393 -0
  455. control_plane_api/worker/worker.py +1297 -0
  456. control_plane_api/worker/workflows/__init__.py +0 -0
  457. control_plane_api/worker/workflows/agent_execution.py +909 -0
  458. control_plane_api/worker/workflows/scheduled_job_wrapper.py +332 -0
  459. control_plane_api/worker/workflows/team_execution.py +611 -0
  460. kubiya_control_plane_api-0.9.15.dist-info/METADATA +354 -0
  461. kubiya_control_plane_api-0.9.15.dist-info/RECORD +479 -0
  462. kubiya_control_plane_api-0.9.15.dist-info/WHEEL +5 -0
  463. kubiya_control_plane_api-0.9.15.dist-info/entry_points.txt +5 -0
  464. kubiya_control_plane_api-0.9.15.dist-info/licenses/LICENSE +676 -0
  465. kubiya_control_plane_api-0.9.15.dist-info/top_level.txt +3 -0
  466. scripts/__init__.py +1 -0
  467. scripts/migrations.py +39 -0
  468. scripts/seed_worker_queues.py +128 -0
  469. scripts/setup_agent_runtime.py +142 -0
  470. worker_internal/__init__.py +1 -0
  471. worker_internal/planner/__init__.py +1 -0
  472. worker_internal/planner/activities.py +1499 -0
  473. worker_internal/planner/agent_tools.py +197 -0
  474. worker_internal/planner/event_models.py +148 -0
  475. worker_internal/planner/event_publisher.py +67 -0
  476. worker_internal/planner/models.py +199 -0
  477. worker_internal/planner/retry_logic.py +134 -0
  478. worker_internal/planner/worker.py +300 -0
  479. worker_internal/planner/workflows.py +970 -0
control_plane_api/app/routers/worker_queues.py
@@ -0,0 +1,2555 @@
+"""
+Worker Queues router - Manage worker queues within environments.
+
+Each environment can have multiple worker queues for fine-grained worker management.
+Task queue naming: {org_id}.{environment_name}.{worker_queue_name}
+"""
+
+from fastapi import APIRouter, Depends, HTTPException, status, Request
+from fastapi.responses import PlainTextResponse
+from typing import List, Optional, Literal, Dict
+from datetime import datetime, timedelta, timezone
+from pydantic import BaseModel, Field, field_validator
+import structlog
+import uuid
+import os
+import json
+import hashlib
+
+from control_plane_api.app.utils.helpers import is_local_temporal
+from control_plane_api.app.middleware.auth import get_current_organization
+from control_plane_api.app.lib.redis_client import get_redis_client
+from control_plane_api.app.database import get_db
+from control_plane_api.app.models.worker import WorkerQueue, WorkerHeartbeat
+from control_plane_api.app.models.environment import Environment
+from control_plane_api.app.models.execution import Execution
+from control_plane_api.app.config import settings
+from control_plane_api.app.schemas.worker_queue_observability_schemas import (
+    WorkerQueueMetricsResponse,
+    WorkflowsListResponse
+)
+from control_plane_api.app.services.worker_queue_metrics_service import WorkerQueueMetricsService
+from sqlalchemy.orm import Session, joinedload
+from sqlalchemy import desc
+from control_plane_api.app.lib.environment import detect_environment
+from control_plane_api.app.observability import (
+    instrument_endpoint,
+    create_span_with_context,
+    add_span_event,
+    add_span_error,
+)
+
+logger = structlog.get_logger()
+
+router = APIRouter()
+
+# Stale worker threshold: must be >= 2x the heartbeat interval to avoid false negatives.
+# The default heartbeat interval is 60s, so the threshold is 120s (2x) plus a 30s grace period.
+# Workers that haven't sent a heartbeat in 150s are considered inactive.
+STALE_WORKER_THRESHOLD_SECONDS = 150
+
+
+# LiteLLM Configuration Schemas
+class LiteLLMModelConfig(BaseModel):
+    """Single model configuration for LiteLLM proxy"""
+    model_name: str = Field(..., description="User-facing model name (e.g., gpt-4)")
+    litellm_params: dict = Field(..., description="Parameters passed to litellm.completion() including model, api_base, api_key, etc.")
+
+
+class LiteLLMConfig(BaseModel):
+    """Complete LiteLLM proxy configuration for local worker proxy"""
+    model_list: List[LiteLLMModelConfig] = Field(..., description="List of models to configure in the local proxy")
+    litellm_settings: Optional[dict] = Field(None, description="LiteLLM settings (callbacks, rate limits, etc.)")
+    environment_variables: Optional[dict] = Field(None, description="Environment variables for the proxy (Langfuse keys, etc.)")
+
+
+class QueueSettings(BaseModel):
+    """Worker queue settings schema with validation"""
+    enable_local_litellm_proxy: bool = Field(False, description="Enable local LiteLLM proxy for this queue")
+    litellm_config: Optional[LiteLLMConfig] = Field(None, description="LiteLLM proxy configuration (required if enable_local_litellm_proxy is true)")
+    local_proxy_timeout_seconds: int = Field(10, ge=5, le=60, description="Proxy startup timeout in seconds")
+    local_proxy_max_retries: int = Field(3, ge=1, le=10, description="Maximum retry attempts for proxy startup")
+
+
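As a concrete illustration of the settings schema above, here is a minimal sketch of a queue `settings` payload that would pass QueueSettings validation. The model name, api_base, api_key, and environment-variable values are placeholders chosen for the example, not defaults shipped by the package.

# Illustrative example only; keys follow the QueueSettings/LiteLLMConfig schemas above,
# while the model name, api_base, api_key, and Langfuse host are placeholder values.
example_settings = {
    "enable_local_litellm_proxy": True,
    "litellm_config": {
        "model_list": [
            {
                "model_name": "gpt-4",
                "litellm_params": {
                    "model": "openai/gpt-4",
                    "api_base": "https://llm-gateway.example.invalid/v1",
                    "api_key": "placeholder-key",
                },
            }
        ],
        "litellm_settings": {"num_retries": 2},
        "environment_variables": {"LANGFUSE_HOST": "https://langfuse.example.invalid"},
    },
    "local_proxy_timeout_seconds": 15,
    "local_proxy_max_retries": 3,
}

QueueSettings(**example_settings)  # raises a pydantic ValidationError if the structure is wrong
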
+async def get_active_workers_from_redis(org_id: str, queue_id: Optional[str] = None, db: Session = None) -> dict:
+    """
+    Get active workers from Redis heartbeats.
+
+    Redis heartbeat keys carry an automatic TTL (5 minutes), so if a worker stops sending
+    heartbeats its key expires on its own. This eliminates the need to manually mark workers as stale.
+
+    Args:
+        org_id: Organization ID
+        queue_id: Optional queue ID to filter by
+        db: Database session (optional)
+
+    Returns:
+        Dict with worker_id -> heartbeat_data mapping
+    """
+    redis_client = get_redis_client()
+
+    if not redis_client:
+        logger.warning("redis_unavailable_for_worker_query", org_id=org_id)
+        return {}
+
+    # If no session provided, create one
+    should_close_db = False
+    if db is None:
+        from control_plane_api.app.database import get_session_local
+        SessionLocal = get_session_local()
+        db = SessionLocal()
+        should_close_db = True
+
+    try:
+        # Get all worker heartbeat keys for this org
+        # We need to get worker records from DB to map worker_id -> queue_id
+        workers_db = db.query(WorkerHeartbeat).filter(
+            WorkerHeartbeat.organization_id == org_id
+        ).all()
+
+        if not workers_db:
+            return {}
+
+        # Filter workers by queue_id if specified
+        workers_to_check = []
+        worker_queue_map = {}
+        # Also track registered_at times (as timezone-aware datetimes)
+        worker_registered_at = {}
+        for worker in workers_db:
+            worker_id = str(worker.id)
+            worker_queue_id = str(worker.worker_queue_id) if worker.worker_queue_id else None
+
+            # Skip if queue_id filter is specified and doesn't match
+            if queue_id and worker_queue_id != queue_id:
+                continue
+
+            workers_to_check.append(worker_id)
+            worker_queue_map[worker_id] = worker_queue_id
+            # Ensure registered_at is timezone-aware for any future comparisons
+            if worker.registered_at:
+                reg_at = worker.registered_at
+                if reg_at.tzinfo is None:
+                    reg_at = reg_at.replace(tzinfo=timezone.utc)
+                worker_registered_at[worker_id] = reg_at
+
+        if not workers_to_check:
+            return {}
+
+        # Batch fetch all heartbeats with a single Redis MGET call
+        redis_keys = [f"worker:{worker_id}:heartbeat" for worker_id in workers_to_check]
+        heartbeat_results = await redis_client.mget(redis_keys)
+
+        # Process results
+        active_workers = {}
+        now_utc = datetime.now(timezone.utc)  # Pre-compute timezone-aware now
+
+        for worker_id in workers_to_check:
+            redis_key = f"worker:{worker_id}:heartbeat"
+            heartbeat_data = heartbeat_results.get(redis_key)
+
+            if heartbeat_data:
+                try:
+                    data = json.loads(heartbeat_data)
+                    # Check if heartbeat is recent (within threshold)
+                    last_heartbeat_str = data.get("last_heartbeat", "")
+                    if not last_heartbeat_str:
+                        logger.warning("missing_last_heartbeat", worker_id=worker_id)
+                        continue
+
+                    # Handle ISO format with 'Z' suffix (Python < 3.11 doesn't handle 'Z')
+                    if last_heartbeat_str.endswith('Z'):
+                        last_heartbeat_str = last_heartbeat_str[:-1] + '+00:00'
+
+                    last_heartbeat = datetime.fromisoformat(last_heartbeat_str)
+
+                    # Ensure timezone-aware datetime
+                    if last_heartbeat.tzinfo is None:
+                        last_heartbeat = last_heartbeat.replace(tzinfo=timezone.utc)
+
+                    # Calculate age - convert both to timestamps to avoid timezone issues
+                    try:
+                        now_ts = now_utc.timestamp()
+                        # Convert last_heartbeat to timestamp
+                        if last_heartbeat.tzinfo is None:
+                            last_heartbeat = last_heartbeat.replace(tzinfo=timezone.utc)
+                        hb_ts = last_heartbeat.timestamp()
+                        age_seconds = now_ts - hb_ts
+                    except (TypeError, AttributeError, OSError) as dt_err:
+                        # If datetime comparison fails, skip this worker
+                        logger.warning("datetime_comparison_failed", worker_id=worker_id, error=str(dt_err))
+                        continue
+
+                    if age_seconds <= STALE_WORKER_THRESHOLD_SECONDS:
+                        active_workers[worker_id] = {
+                            **data,
+                            "worker_queue_id": worker_queue_map[worker_id],
+                        }
+                    else:
+                        logger.debug(
+                            "worker_heartbeat_stale",
+                            worker_id=worker_id,
+                            age_seconds=age_seconds,
+                            threshold=STALE_WORKER_THRESHOLD_SECONDS
+                        )
+                except (json.JSONDecodeError, ValueError, TypeError) as e:
+                    logger.warning("invalid_heartbeat_data", worker_id=worker_id, error=str(e))
+                    continue
+
+        logger.debug(
+            "active_workers_fetched",
+            org_id=org_id,
+            total_workers=len(workers_to_check),
+            active_workers=len(active_workers),
+            queue_id=queue_id,
+        )
+
+        return active_workers
+
+    except Exception as e:
+        import traceback
+        logger.error(
+            "failed_to_get_active_workers_from_redis",
+            error=str(e),
+            org_id=org_id,
+            error_type=type(e).__name__,
+            line_info=traceback.format_exc().split("\n")[-3] if traceback.format_exc() else "unknown",
+        )
+        return {}
+    finally:
+        if should_close_db and db:
+            db.close()
+
+
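To make the heartbeat contract concrete: the reader above expects each worker to keep a JSON document under worker:{worker_id}:heartbeat with an ISO-8601 last_heartbeat field, refreshed more often than the TTL. Below is a minimal sketch of the producing side, assuming a plain redis.asyncio client and the 5-minute TTL mentioned in the docstring; the package's actual worker-side implementation is not shown here.

# Hedged sketch of the worker-side producer this function expects. Assumptions:
# a standard redis.asyncio client and a 300s TTL (the "5 minutes" noted above).
import json
from datetime import datetime, timezone

import redis.asyncio as aioredis

HEARTBEAT_TTL_SECONDS = 300


async def publish_heartbeat(r: aioredis.Redis, worker_id: str) -> None:
    payload = {
        "worker_id": worker_id,
        # get_active_workers_from_redis() parses this ISO-8601 timestamp and
        # compares its age against STALE_WORKER_THRESHOLD_SECONDS (150s).
        "last_heartbeat": datetime.now(timezone.utc).isoformat(),
    }
    # SET with EX gives the key the automatic expiry the reader relies on.
    await r.set(f"worker:{worker_id}:heartbeat", json.dumps(payload), ex=HEARTBEAT_TTL_SECONDS)
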
+# Pydantic schemas
+class WorkerQueueCreate(BaseModel):
+    name: str = Field(..., min_length=2, max_length=50, description="Worker queue name (lowercase, no spaces)")
+    display_name: Optional[str] = Field(None, description="User-friendly display name")
+    description: Optional[str] = Field(None, description="Queue description")
+    max_workers: Optional[int] = Field(None, ge=1, description="Max workers allowed (NULL = unlimited)")
+    heartbeat_interval: int = Field(60, ge=10, le=300, description="Seconds between heartbeats (lightweight)")
+    tags: List[str] = Field(default_factory=list)
+    settings: dict = Field(default_factory=dict)
+
+    @field_validator("settings")
+    def validate_settings(cls, v):
+        """Validate settings structure including litellm_config"""
+        if not v:
+            return v
+
+        try:
+            # Validate entire settings dict using QueueSettings schema
+            QueueSettings(**v)
+        except Exception as e:
+            raise ValueError(f"Invalid settings: {str(e)}")
+
+        # Additional validation: if enable_local_litellm_proxy is true, litellm_config is required
+        if v.get("enable_local_litellm_proxy") and not v.get("litellm_config"):
+            raise ValueError("litellm_config is required when enable_local_litellm_proxy is true")
+
+        return v
+
+
+class WorkerQueueUpdate(BaseModel):
+    name: Optional[str] = Field(None, min_length=2, max_length=50)
+    display_name: Optional[str] = None
+    description: Optional[str] = None
+    status: Optional[str] = None
+    max_workers: Optional[int] = Field(None, ge=1)
+    heartbeat_interval: Optional[int] = Field(None, ge=10, le=300)
+    tags: Optional[List[str]] = None
+    settings: Optional[dict] = None
+
+    @field_validator("settings")
+    def validate_settings(cls, v):
+        """Validate settings structure including litellm_config"""
+        if not v:
+            return v
+
+        try:
+            # Validate entire settings dict using QueueSettings schema
+            QueueSettings(**v)
+        except Exception as e:
+            raise ValueError(f"Invalid settings: {str(e)}")
+
+        # Additional validation: if enable_local_litellm_proxy is true, litellm_config is required
+        if v.get("enable_local_litellm_proxy") and not v.get("litellm_config"):
+            raise ValueError("litellm_config is required when enable_local_litellm_proxy is true")
+
+        return v
+
+
+class WorkerQueueResponse(BaseModel):
+    id: str
+    organization_id: str
+    environment_id: str
+    name: str
+    display_name: Optional[str]
+    description: Optional[str]
+    status: str
+    max_workers: Optional[int]
+    heartbeat_interval: int
+    tags: List[str]
+    settings: dict
+    created_at: datetime
+    updated_at: datetime
+    created_by: Optional[str]
+    # Computed
+    active_workers: int = 0
+    task_queue_name: str  # Task queue name handed to workers (the endpoints below use the queue UUID)
+    environment_name: Optional[str] = None  # Populated by the list-all endpoint from the environment relationship
+
+
+    @field_validator("id", "environment_id", "created_by", mode="before")
+    def cast_to_string(cls, v):
+        if v is None:
+            return None
+        return str(v)
+
+
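For reference, a serialized WorkerQueueResponse comes out roughly as the dict below. All identifiers and timestamps are invented for illustration; note that task_queue_name mirrors id because the endpoints below use the queue UUID as the task queue name.

# Example response payload (invented values); the field set matches WorkerQueueResponse.
example_worker_queue_response = {
    "id": "3f9a2a7e-1d7c-4b1e-9a61-2f6f0c9d4b10",
    "organization_id": "org_123",
    "environment_id": "9a1b2c3d-4e5f-6071-8293-a4b5c6d7e8f9",
    "name": "default",
    "display_name": "default",
    "description": None,
    "status": "active",
    "max_workers": None,
    "heartbeat_interval": 60,
    "tags": [],
    "settings": {},
    "created_at": "2025-01-01T00:00:00+00:00",
    "updated_at": "2025-01-01T00:00:00+00:00",
    "created_by": None,
    "active_workers": 1,
    "task_queue_name": "3f9a2a7e-1d7c-4b1e-9a61-2f6f0c9d4b10",  # same value as "id"
    "environment_name": "production",  # only filled in by the list-all endpoint
}
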
+ @router.get("/worker-queues", response_model=List[WorkerQueueResponse])
309
+ @instrument_endpoint("worker_queues.list_all_worker_queues")
310
+ async def list_all_worker_queues(
311
+ request: Request,
312
+ organization: dict = Depends(get_current_organization),
313
+ db: Session = Depends(get_db),
314
+ ):
315
+ """List all worker queues across all environments for the organization (excluding ephemeral queues)"""
316
+ try:
317
+ org_id = organization["id"]
318
+
319
+ # Get all non-ephemeral worker queues for this organization with environment relationship
320
+ # Also exclude queues starting with "local-exec" (ephemeral local execution queues)
321
+ queues_db = (
322
+ db.query(WorkerQueue)
323
+ .options(joinedload(WorkerQueue.environment))
324
+ .filter(
325
+ WorkerQueue.organization_id == org_id,
326
+ WorkerQueue.ephemeral == False, # Exclude ephemeral queues
327
+ ~WorkerQueue.name.startswith('local-exec') # Exclude local-exec queues
328
+ )
329
+ .order_by(WorkerQueue.created_at.asc())
330
+ .all()
331
+ )
332
+
333
+ if not queues_db:
334
+ return []
335
+
336
+ # Get active workers from Redis (with automatic TTL-based expiration)
337
+ active_workers = await get_active_workers_from_redis(org_id, db=db)
338
+
339
+ # Count workers per queue
340
+ worker_counts = {}
341
+ for worker_id, worker_data in active_workers.items():
342
+ queue_id = worker_data.get("worker_queue_id")
343
+ if queue_id:
344
+ worker_counts[queue_id] = worker_counts.get(queue_id, 0) + 1
345
+
346
+ # Build response
347
+ queues = []
348
+ for queue in queues_db:
349
+ # Use queue UUID as task queue name for security
350
+ task_queue_name = str(queue.id)
351
+ active_worker_count = worker_counts.get(str(queue.id), 0)
352
+
353
+ # Get environment name from relationship
354
+ environment_name = queue.environment.name if queue.environment else None
355
+
356
+ from sqlalchemy.inspection import inspect
357
+ queue_dict = {c.key: getattr(queue, c.key) for c in inspect(queue).mapper.column_attrs}
358
+
359
+ queues.append(
360
+ WorkerQueueResponse(
361
+ **queue_dict,
362
+ active_workers=active_worker_count,
363
+ task_queue_name=task_queue_name,
364
+ environment_name=environment_name,
365
+ )
366
+ )
367
+
368
+ logger.info(
369
+ "all_worker_queues_listed",
370
+ count=len(queues),
371
+ org_id=org_id,
372
+ )
373
+
374
+ return queues
375
+
376
+ except HTTPException:
377
+ raise
378
+ except Exception as e:
379
+ logger.error("all_worker_queues_list_failed", error=str(e), org_id=org_id)
380
+ raise HTTPException(
381
+ status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
382
+ detail=f"Failed to list all worker queues: {str(e)}"
383
+ )
384
+
385
+
386
+ @router.post("/environments/{environment_id}/worker-queues", response_model=WorkerQueueResponse, status_code=status.HTTP_201_CREATED)
387
+ @instrument_endpoint("worker_queues.create_worker_queue")
388
+ async def create_worker_queue(
389
+ environment_id: str,
390
+ queue_data: WorkerQueueCreate,
391
+ request: Request,
392
+ organization: dict = Depends(get_current_organization),
393
+ db: Session = Depends(get_db),
394
+ ):
395
+ """Create a new worker queue within an environment"""
396
+ try:
397
+ org_id = organization["id"]
398
+
399
+ # Validate environment exists
400
+ environment = (
401
+ db.query(Environment)
402
+ .filter(Environment.id == environment_id, Environment.organization_id == org_id)
403
+ .first()
404
+ )
405
+
406
+ if not environment:
407
+ raise HTTPException(
408
+ status_code=status.HTTP_404_NOT_FOUND,
409
+ detail="Environment not found"
410
+ )
411
+
412
+ # Check if worker queue name already exists in this environment
413
+ existing = (
414
+ db.query(WorkerQueue)
415
+ .filter(
416
+ WorkerQueue.environment_id == environment_id,
417
+ WorkerQueue.name == queue_data.name
418
+ )
419
+ .first()
420
+ )
421
+
422
+ if existing:
423
+ raise HTTPException(
424
+ status_code=status.HTTP_409_CONFLICT,
425
+ detail=f"Worker queue '{queue_data.name}' already exists in this environment"
426
+ )
427
+
428
+ # Create worker queue
429
+ queue_id = str(uuid.uuid4())
430
+ now = datetime.now(timezone.utc)
431
+
432
+ # Automatically mark as ephemeral if name starts with "local-exec"
433
+ is_ephemeral = queue_data.name.startswith("local-exec")
434
+
435
+ queue = WorkerQueue(
436
+ id=queue_id,
437
+ organization_id=org_id,
438
+ environment_id=environment_id,
439
+ name=queue_data.name,
440
+ display_name=queue_data.display_name or queue_data.name,
441
+ description=queue_data.description,
442
+ status="active",
443
+ max_workers=queue_data.max_workers,
444
+ heartbeat_interval=queue_data.heartbeat_interval,
445
+ tags=queue_data.tags,
446
+ settings=queue_data.settings,
447
+ ephemeral=is_ephemeral,
448
+ created_at=now,
449
+ updated_at=now,
450
+ created_by=organization.get("user_id"),
451
+ )
452
+
453
+ db.add(queue)
454
+ db.commit()
455
+ db.refresh(queue)
456
+
457
+ # Convert to dict for Pydantic response
458
+ from sqlalchemy.inspection import inspect
459
+ queue_dict = {c.key: getattr(queue, c.key) for c in inspect(queue).mapper.column_attrs}
460
+
461
+ # Use queue UUID as task queue name for security (unpredictable)
462
+ task_queue_name = queue_id
463
+
464
+ logger.info(
465
+ "worker_queue_created",
466
+ queue_id=queue_id,
467
+ queue_name=queue.name,
468
+ environment_id=environment_id,
469
+ task_queue_name=task_queue_name,
470
+ org_id=org_id,
471
+ )
472
+
473
+ return WorkerQueueResponse(
474
+ **queue_dict,
475
+ active_workers=0,
476
+ task_queue_name=task_queue_name,
477
+ )
478
+
479
+ except HTTPException:
480
+ raise
481
+ except Exception as e:
482
+ logger.error("worker_queue_creation_failed", error=str(e), org_id=organization["id"])
483
+ raise HTTPException(
484
+ status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
485
+ detail=f"Failed to create worker queue: {str(e)}"
486
+ )
487
+
488
+
489
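A usage sketch for the create endpoint above; the host, URL prefix, and bearer token are assumptions (the prefix under which this router is mounted is not defined in this file), and the payload fields follow WorkerQueueCreate.

# Hypothetical client call; base URL, path prefix, and auth header are assumptions.
import httpx

resp = httpx.post(
    "https://control-plane.example.invalid/environments/9a1b2c3d-4e5f-6071-8293-a4b5c6d7e8f9/worker-queues",
    headers={"Authorization": "Bearer <token>"},
    json={
        "name": "gpu-workers",
        "display_name": "GPU Workers",
        "max_workers": 5,
        "heartbeat_interval": 60,
        "tags": ["gpu"],
        "settings": {},  # could also carry the QueueSettings payload sketched earlier
    },
)
resp.raise_for_status()
queue = resp.json()
# Per the handler above, the task queue name is the queue UUID:
assert queue["task_queue_name"] == queue["id"]
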
+ @router.get("/environments/{environment_id}/worker-queues", response_model=List[WorkerQueueResponse])
490
+ @instrument_endpoint("worker_queues.list_worker_queues")
491
+ async def list_worker_queues(
492
+ environment_id: str,
493
+ request: Request,
494
+ organization: dict = Depends(get_current_organization),
495
+ db: Session = Depends(get_db),
496
+ ):
497
+ """List all worker queues in an environment (excluding ephemeral queues)"""
498
+ try:
499
+ org_id = organization["id"]
500
+
501
+ # Get environment name
502
+ environment = (
503
+ db.query(Environment)
504
+ .filter(Environment.id == environment_id, Environment.organization_id == org_id)
505
+ .first()
506
+ )
507
+
508
+ if not environment:
509
+ raise HTTPException(
510
+ status_code=status.HTTP_404_NOT_FOUND,
511
+ detail="Environment not found"
512
+ )
513
+
514
+ environment_name = environment.name
515
+
516
+ # Get non-ephemeral worker queues only
517
+ # Also exclude queues starting with "local-exec" (ephemeral local execution queues)
518
+ queues_db = (
519
+ db.query(WorkerQueue)
520
+ .filter(
521
+ WorkerQueue.environment_id == environment_id,
522
+ WorkerQueue.ephemeral == False, # Exclude ephemeral queues
523
+ ~WorkerQueue.name.startswith('local-exec') # Exclude local-exec queues
524
+ )
525
+ .order_by(WorkerQueue.created_at.asc())
526
+ .all()
527
+ )
528
+
529
+ if not queues_db:
530
+ return []
531
+
532
+ # Get active workers from Redis (with automatic TTL-based expiration)
533
+ active_workers = await get_active_workers_from_redis(org_id, db=db)
534
+
535
+ # Count workers per queue
536
+ worker_counts = {}
537
+ for worker_id, worker_data in active_workers.items():
538
+ queue_id = worker_data.get("worker_queue_id")
539
+ if queue_id:
540
+ worker_counts[queue_id] = worker_counts.get(queue_id, 0) + 1
541
+
542
+ # Build response
543
+ queues = []
544
+ for queue in queues_db:
545
+ # Use queue UUID as task queue name for security
546
+ task_queue_name = str(queue.id)
547
+ active_worker_count = worker_counts.get(str(queue.id), 0)
548
+
549
+ from sqlalchemy.inspection import inspect
550
+ queue_dict = {c.key: getattr(queue, c.key) for c in inspect(queue).mapper.column_attrs}
551
+
552
+ queues.append(
553
+ WorkerQueueResponse(
554
+ **queue_dict,
555
+ active_workers=active_worker_count,
556
+ task_queue_name=task_queue_name,
557
+ )
558
+ )
559
+
560
+ logger.info(
561
+ "worker_queues_listed",
562
+ count=len(queues),
563
+ environment_id=environment_id,
564
+ org_id=org_id,
565
+ )
566
+
567
+ return queues
568
+
569
+ except HTTPException:
570
+ raise
571
+ except Exception as e:
572
+ logger.error("worker_queues_list_failed", error=str(e), environment_id=environment_id)
573
+ raise HTTPException(
574
+ status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
575
+ detail=f"Failed to list worker queues: {str(e)}"
576
+ )
577
+
578
+
579
+ @router.get("/worker-queues/{queue_id}", response_model=WorkerQueueResponse)
580
+ @instrument_endpoint("worker_queues.get_worker_queue")
581
+ async def get_worker_queue(
582
+ queue_id: str,
583
+ request: Request,
584
+ organization: dict = Depends(get_current_organization),
585
+ db: Session = Depends(get_db),
586
+ ):
587
+ """Get a specific worker queue by ID"""
588
+ try:
589
+ org_id = organization["id"]
590
+
591
+ # Get worker queue with environment relationship
592
+ queue = (
593
+ db.query(WorkerQueue)
594
+ .options(joinedload(WorkerQueue.environment))
595
+ .filter(WorkerQueue.id == queue_id, WorkerQueue.organization_id == org_id)
596
+ .first()
597
+ )
598
+
599
+ if not queue:
600
+ raise HTTPException(
601
+ status_code=status.HTTP_404_NOT_FOUND,
602
+ detail="Worker queue not found"
603
+ )
604
+
605
+ # Get environment name from relationship
606
+ environment_name = queue.environment.name if queue.environment else "unknown"
607
+
608
+ # Get active workers from Redis for this specific queue
609
+ active_workers_dict = await get_active_workers_from_redis(org_id, queue_id, db=db)
610
+ active_worker_count = len(active_workers_dict)
611
+
612
+ # Convert to dict for Pydantic response
613
+ from sqlalchemy.inspection import inspect
614
+ queue_dict = {c.key: getattr(queue, c.key) for c in inspect(queue).mapper.column_attrs}
615
+
616
+ # Use queue UUID as task queue name for security
617
+ task_queue_name = queue_id
618
+
619
+ return WorkerQueueResponse(
620
+ **queue_dict,
621
+ active_workers=active_worker_count,
622
+ task_queue_name=task_queue_name,
623
+ )
624
+
625
+ except HTTPException:
626
+ raise
627
+ except Exception as e:
628
+ logger.error("worker_queue_get_failed", error=str(e), queue_id=queue_id)
629
+ raise HTTPException(
630
+ status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
631
+ detail=f"Failed to get worker queue: {str(e)}"
632
+ )
633
+
634
+
635
+ @router.patch("/worker-queues/{queue_id}", response_model=WorkerQueueResponse)
636
+ @instrument_endpoint("worker_queues.update_worker_queue")
637
+ async def update_worker_queue(
638
+ queue_id: str,
639
+ queue_data: WorkerQueueUpdate,
640
+ request: Request,
641
+ organization: dict = Depends(get_current_organization),
642
+ db: Session = Depends(get_db),
643
+ ):
644
+ """Update a worker queue"""
645
+ try:
646
+ org_id = organization["id"]
647
+
648
+ # Check if queue exists and get it with environment relationship
649
+ queue = (
650
+ db.query(WorkerQueue)
651
+ .options(joinedload(WorkerQueue.environment))
652
+ .filter(WorkerQueue.id == queue_id, WorkerQueue.organization_id == org_id)
653
+ .first()
654
+ )
655
+
656
+ if not queue:
657
+ raise HTTPException(
658
+ status_code=status.HTTP_404_NOT_FOUND,
659
+ detail="Worker queue not found"
660
+ )
661
+
662
+ # Build update dict and apply updates
663
+ update_data = queue_data.model_dump(exclude_unset=True)
664
+ for key, value in update_data.items():
665
+ setattr(queue, key, value)
666
+
667
+ queue.updated_at = datetime.now(timezone.utc)
668
+
669
+ db.commit()
670
+ db.refresh(queue)
671
+
672
+ # Get environment name from relationship
673
+ environment_name = queue.environment.name if queue.environment else "unknown"
674
+
675
+ # Get active workers from Redis for this specific queue
676
+ active_workers_dict = await get_active_workers_from_redis(org_id, queue_id, db=db)
677
+ active_worker_count = len(active_workers_dict)
678
+
679
+ # Convert to dict for Pydantic response
680
+ from sqlalchemy.inspection import inspect
681
+ queue_dict = {c.key: getattr(queue, c.key) for c in inspect(queue).mapper.column_attrs}
682
+
683
+ # Use queue UUID as task queue name for security
684
+ task_queue_name = queue_id
685
+
686
+ logger.info(
687
+ "worker_queue_updated",
688
+ queue_id=queue_id,
689
+ org_id=org_id,
690
+ )
691
+
692
+ return WorkerQueueResponse(
693
+ **queue_dict,
694
+ active_workers=active_worker_count,
695
+ task_queue_name=task_queue_name,
696
+ )
697
+
698
+ except HTTPException:
699
+ raise
700
+ except Exception as e:
701
+ logger.error("worker_queue_update_failed", error=str(e), queue_id=queue_id)
702
+ raise HTTPException(
703
+ status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
704
+ detail=f"Failed to update worker queue: {str(e)}"
705
+ )
706
+
707
+
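The PATCH handler relies on model_dump(exclude_unset=True) so that only fields the caller actually sent are written back to the row. A small sketch of that behavior with Pydantic v2 (the schema below is an illustrative subset, not the real WorkerQueueUpdate):

from typing import Optional
from pydantic import BaseModel

class QueuePatch(BaseModel):  # illustrative subset of an update schema
    display_name: Optional[str] = None
    max_workers: Optional[int] = None

patch = QueuePatch(max_workers=4)
print(patch.model_dump())                    # {'display_name': None, 'max_workers': 4}
print(patch.model_dump(exclude_unset=True))  # {'max_workers': 4} - unsent fields are never overwritten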
708
+ @router.delete("/worker-queues/{queue_id}", status_code=status.HTTP_204_NO_CONTENT)
709
+ @instrument_endpoint("worker_queues.delete_worker_queue")
710
+ async def delete_worker_queue(
711
+ queue_id: str,
712
+ request: Request,
713
+ organization: dict = Depends(get_current_organization),
714
+ db: Session = Depends(get_db),
715
+ ):
716
+ """Delete a worker queue"""
717
+ try:
718
+ org_id = organization["id"]
719
+
720
+ # Prevent deleting default queue and check if queue exists
721
+ queue = (
722
+ db.query(WorkerQueue)
723
+ .filter(WorkerQueue.id == queue_id, WorkerQueue.organization_id == org_id)
724
+ .first()
725
+ )
726
+
727
+ if not queue:
728
+ raise HTTPException(
729
+ status_code=status.HTTP_404_NOT_FOUND,
730
+ detail="Worker queue not found"
731
+ )
732
+
733
+ if queue.name == "default":
734
+ raise HTTPException(
735
+ status_code=status.HTTP_400_BAD_REQUEST,
736
+ detail="Cannot delete the default worker queue"
737
+ )
738
+
739
+ # Check for active workers in Redis
740
+ active_workers = await get_active_workers_from_redis(org_id, queue_id, db=db)
741
+
742
+ if active_workers:
743
+ raise HTTPException(
744
+ status_code=status.HTTP_400_BAD_REQUEST,
745
+ detail=f"Cannot delete worker queue with {len(active_workers)} active workers"
746
+ )
747
+
748
+ # Delete queue
749
+ db.delete(queue)
750
+ db.commit()
751
+
752
+ logger.info("worker_queue_deleted", queue_id=queue_id, org_id=org_id)
753
+
754
+ return None
755
+
756
+ except HTTPException:
757
+ raise
758
+ except Exception as e:
759
+ logger.error("worker_queue_delete_failed", error=str(e), queue_id=queue_id)
760
+ raise HTTPException(
761
+ status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
762
+ detail=f"Failed to delete worker queue: {str(e)}"
763
+ )
764
+
765
+
766
+ @router.get("/worker-queues/{queue_id}/install-script")
767
+ @instrument_endpoint("worker_queues.get_installation_script")
768
+ async def get_installation_script(
769
+ queue_id: str,
770
+ deployment_type: Literal["docker", "kubernetes", "openshift", "local"] = "local",
771
+ request: Request = None,
772
+ organization: dict = Depends(get_current_organization),
773
+ db: Session = Depends(get_db),
774
+ ):
775
+ """
776
+ Generate an installation script for setting up a worker for this queue.
777
+
778
+ Supports multiple deployment types:
779
+ - local: Python virtual environment setup
780
+ - docker: Docker run command
781
+ - kubernetes: Kubernetes deployment YAML
782
+ - openshift: OpenShift deployment YAML
783
+ """
784
+ try:
785
+ org_id = organization["id"]
786
+
787
+ # Get worker queue details with environment relationship
788
+ queue = (
789
+ db.query(WorkerQueue)
790
+ .options(joinedload(WorkerQueue.environment))
791
+ .filter(WorkerQueue.id == queue_id, WorkerQueue.organization_id == org_id)
792
+ .first()
793
+ )
794
+
795
+ if not queue:
796
+ raise HTTPException(
797
+ status_code=status.HTTP_404_NOT_FOUND,
798
+ detail="Worker queue not found"
799
+ )
800
+
801
+ # Get environment name from relationship
802
+ environment_name = "default"
803
+ if queue.environment:
804
+ environment_name = queue.environment.name
805
+
806
+ queue_name = queue.name
807
+
808
+ # Get control plane URL from the request that reached us
809
+ # This ensures installation scripts use the correct URL
810
+ control_plane_url = f"{request.url.scheme}://{request.url.netloc}"
811
+
812
+ # Generate new worker ID
813
+ worker_id = str(uuid.uuid4())
814
+
815
+ # Generate script based on deployment type
816
+ if deployment_type == "local":
817
+ script = _generate_local_script(worker_id, control_plane_url)
818
+ elif deployment_type == "docker":
819
+ script = _generate_docker_script(worker_id, control_plane_url, queue_name, environment_name)
820
+ elif deployment_type == "kubernetes":
821
+ script = _generate_kubernetes_script(worker_id, control_plane_url, queue_name, environment_name)
822
+ elif deployment_type == "openshift":
823
+ script = _generate_openshift_script(worker_id, control_plane_url, queue_name, environment_name)
824
+ else:
825
+ raise HTTPException(
826
+ status_code=status.HTTP_400_BAD_REQUEST,
827
+ detail=f"Unsupported deployment type: {deployment_type}"
828
+ )
829
+
830
+ logger.info(
831
+ "installation_script_generated",
832
+ queue_id=queue_id,
833
+ deployment_type=deployment_type,
834
+ worker_id=worker_id,
835
+ org_id=org_id,
836
+ )
837
+
838
+ return PlainTextResponse(content=script, media_type="text/plain")
839
+
840
+ except HTTPException:
841
+ raise
842
+ except Exception as e:
843
+ logger.error("installation_script_generation_failed", error=str(e), queue_id=queue_id)
844
+ raise HTTPException(
845
+ status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
846
+ detail=f"Failed to generate installation script: {str(e)}"
847
+ )
848
+
849
+
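How a client might fetch one of these scripts. Only the /worker-queues/{queue_id}/install-script path and the deployment_type query parameter come from the endpoint above; the host, API prefix, and auth header are assumptions for illustration:

import httpx

BASE_URL = "https://control-plane.example.com/api/v1"  # assumed mount point
QUEUE_ID = "<queue-uuid>"

resp = httpx.get(
    f"{BASE_URL}/worker-queues/{QUEUE_ID}/install-script",
    params={"deployment_type": "docker"},
    headers={"Authorization": "Bearer <kubiya-api-key>"},  # auth scheme is an assumption
    timeout=10.0,
)
resp.raise_for_status()
with open("install-worker.sh", "w") as fh:
    fh.write(resp.text)  # the endpoint returns the script as text/plain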
850
+ class WorkerSystemInfo(BaseModel):
851
+ """Worker system information"""
852
+ hostname: Optional[str] = None
853
+ platform: Optional[str] = None
854
+ os_name: Optional[str] = None
855
+ os_version: Optional[str] = None
856
+ python_version: Optional[str] = None
857
+ cli_version: Optional[str] = None
858
+ sdk_version: Optional[str] = None # Worker SDK version
859
+ pid: Optional[int] = None # Process ID
860
+ cwd: Optional[str] = None # Current working directory
861
+ supported_runtimes: Optional[List[str]] = None # Available runtimes (e.g., ["agno", "claude_code"])
862
+ llm_gateway_url: Optional[str] = None # LiteLLM/LLM gateway URL
863
+ docker_available: Optional[bool] = None
864
+ docker_version: Optional[str] = None
865
+ cpu_count: Optional[int] = None
866
+ cpu_percent: Optional[float] = None
867
+ memory_total: Optional[int] = None
868
+ memory_used: Optional[int] = None
869
+ memory_percent: Optional[float] = None
870
+ disk_total: Optional[int] = None
871
+ disk_used: Optional[int] = None
872
+ disk_percent: Optional[float] = None
873
+ uptime_seconds: Optional[float] = None
874
+
875
+
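One way a worker could populate WorkerSystemInfo before calling the start endpoint; a sketch that assumes psutil is available on the worker host (the control plane only defines the schema, not how workers gather the data):

import os
import platform
import sys

import psutil  # assumed worker-side dependency

def collect_system_info() -> dict:
    vm, disk = psutil.virtual_memory(), psutil.disk_usage("/")
    return {
        "hostname": platform.node(),
        "platform": platform.platform(),
        "os_name": platform.system(),
        "os_version": platform.release(),
        "python_version": sys.version.split()[0],
        "pid": os.getpid(),
        "cwd": os.getcwd(),
        "cpu_count": psutil.cpu_count(),
        "cpu_percent": psutil.cpu_percent(interval=0.1),
        "memory_total": vm.total,
        "memory_used": vm.used,
        "memory_percent": vm.percent,
        "disk_total": disk.total,
        "disk_used": disk.used,
        "disk_percent": disk.percent,
    }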
876
+ class WorkerStartRequest(BaseModel):
877
+ """Worker start request with SDK version and system info"""
878
+ worker_sdk_version: Optional[str] = None
879
+ system_info: Optional[WorkerSystemInfo] = None
880
+ control_plane_url: Optional[str] = None
881
+
882
+
883
+ class WorkerStartResponse(BaseModel):
884
+ """Worker start configuration"""
885
+ worker_id: str
886
+ task_queue_name: str # The queue UUID
887
+ temporal_namespace: str
888
+ temporal_host: str
889
+ temporal_api_key: str
890
+ organization_id: str
891
+ control_plane_url: str
892
+ heartbeat_interval: int
893
+ # LiteLLM configuration for agno workflows/activities
894
+ litellm_api_url: str
895
+ litellm_api_key: str
896
+ # Queue metadata
897
+ queue_name: str
898
+ environment_name: str
899
+ queue_id: str # Queue UUID for cleanup
900
+ queue_ephemeral: bool = False # Whether queue is ephemeral
901
+ queue_single_execution: bool = False # Whether queue is for single execution
902
+ # Redis configuration for direct event streaming (default fast path)
903
+ redis_url: Optional[str] = None
904
+ redis_password: Optional[str] = None
905
+ redis_enabled: bool = False
906
+ # WebSocket configuration for per-worker persistent connections
907
+ websocket_enabled: bool = True
908
+ websocket_url: Optional[str] = None
909
+ websocket_features: List[str] = Field(default_factory=lambda: ["events", "control", "heartbeat", "config_update"])
910
+ # NATS configuration for high-performance event bus (optional)
911
+ nats_config: Optional[Dict[str, str]] = None
912
+ # SDK version for compatibility check
913
+ control_plane_sdk_version: str
914
+
915
+
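What a worker might do with this response: connect to Temporal and then poll the returned task queue. A rough sketch assuming the temporalio Python SDK >= 1.5 (which accepts api_key on Client.connect); worker registration and error handling are omitted:

import asyncio
from temporalio.client import Client

async def connect_from_start_response(cfg: dict) -> Client:
    # cfg is the WorkerStartResponse payload, already parsed from JSON
    client = await Client.connect(
        cfg["temporal_host"],
        namespace=cfg["temporal_namespace"],
        api_key=cfg["temporal_api_key"] or None,  # empty string means local Temporal without auth
        tls=bool(cfg["temporal_api_key"]),        # Temporal Cloud API keys require TLS
    )
    # A temporalio Worker would then be created against cfg["task_queue_name"] (the queue UUID)
    return client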
916
+ @router.post("/worker-queues/{queue_id}/start", response_model=WorkerStartResponse)
917
+ @instrument_endpoint("worker_queues.start_worker_for_queue")
918
+ async def start_worker_for_queue(
919
+ queue_id: str,
920
+ request: Request,
921
+ body: WorkerStartRequest = WorkerStartRequest(),
922
+ organization: dict = Depends(get_current_organization),
923
+ db: Session = Depends(get_db),
924
+ ):
925
+ """
926
+ Start a worker for a specific queue.
927
+
928
+ This endpoint is called by the CLI with: kubiya worker start --queue-id={queue_id}
929
+
930
+ Returns all configuration needed for the worker to connect to Temporal.
931
+ """
932
+ # Get control plane SDK version for compatibility check
933
+ from control_plane_api.version import get_sdk_version
934
+ control_plane_sdk_version = get_sdk_version()
935
+
936
+ # Log worker SDK version if provided
937
+ if body.worker_sdk_version:
938
+ logger.info(
939
+ "worker_registration_with_version",
940
+ queue_id=queue_id,
941
+ worker_sdk_version=body.worker_sdk_version,
942
+ control_plane_sdk_version=control_plane_sdk_version,
943
+ )
944
+
945
+ try:
946
+ org_id = organization["id"]
947
+
948
+ # Get worker queue with environment relationship
949
+ queue = (
950
+ db.query(WorkerQueue)
951
+ .options(joinedload(WorkerQueue.environment))
952
+ .filter(WorkerQueue.id == queue_id, WorkerQueue.organization_id == org_id)
953
+ .first()
954
+ )
955
+
956
+ if not queue:
957
+ # Check if queue exists at all (might be in different org)
958
+ queue_check = (
959
+ db.query(WorkerQueue)
960
+ .filter(WorkerQueue.id == queue_id)
961
+ .first()
962
+ )
963
+
964
+ if queue_check:
965
+ raise HTTPException(
966
+ status_code=status.HTTP_403_FORBIDDEN,
967
+ detail=f"Worker queue '{queue_id}' not found in your organization"
968
+ )
969
+ else:
970
+ raise HTTPException(
971
+ status_code=status.HTTP_404_NOT_FOUND,
972
+ detail=f"Worker queue '{queue_id}' does not exist. Please create a queue from the UI first."
973
+ )
974
+
975
+ # Check if environment is configured
976
+ if not queue.environment_id:
977
+ raise HTTPException(
978
+ status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
979
+ detail=f"Worker queue '{queue.name}' has no environment configured. Please contact support."
980
+ )
981
+
982
+ if not queue.environment:
983
+ raise HTTPException(
984
+ status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
985
+ detail=f"Environment configuration error for queue '{queue.name}'. Please contact support."
986
+ )
987
+
988
+ environment_name = queue.environment.name
989
+
990
+ # Check if queue is active
991
+ if queue.status != "active":
992
+ raise HTTPException(
993
+ status_code=status.HTTP_400_BAD_REQUEST,
994
+ detail=f"Worker queue is not active (status: {queue.status})"
995
+ )
996
+
997
+ # Get organization-specific Temporal credentials
998
+ import os
999
+ from control_plane_api.app.lib.temporal_credentials_service import (
1000
+ get_temporal_credentials_for_org,
1001
+ is_local_temporal
1002
+ )
1003
+
1004
+ org_id = organization["id"]
1005
+ token = request.state.kubiya_token
1006
+
1007
+ # Check if local Temporal (for development)
1008
+ if is_local_temporal():
1009
+ logger.info("using_local_temporal_config", queue_id=queue_id, org_id=org_id)
1010
+ temporal_credentials = {
1011
+ "namespace": os.getenv("TEMPORAL_NAMESPACE", "default"),
1012
+ "api_key": "",
1013
+ "host": os.getenv("TEMPORAL_HOST", "localhost:7233"),
1014
+ "org": org_id,
1015
+ }
1016
+ else:
1017
+ # Fetch org-specific credentials from Kubiya API
1018
+ # use_fallback=True for backwards compatibility during rollout
1019
+ try:
1020
+ temporal_credentials = await get_temporal_credentials_for_org(
1021
+ org_id=org_id,
1022
+ token=token,
1023
+ use_fallback=True # Enable fallback during migration
1024
+ )
1025
+
1026
+ logger.info(
1027
+ "temporal_credentials_fetched_for_worker",
1028
+ queue_id=queue_id,
1029
+ org_id=org_id,
1030
+ namespace=temporal_credentials["namespace"],
1031
+ source="kubiya_api"
1032
+ )
1033
+ except Exception as e:
1034
+ logger.error(
1035
+ "temporal_credentials_fetch_failed",
1036
+ queue_id=queue_id,
1037
+ org_id=org_id,
1038
+ error=str(e)
1039
+ )
1040
+ # Reaching this handler means credential lookup failed even with use_fallback=True
1041
+ raise HTTPException(
1042
+ status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
1043
+ detail="Failed to fetch Temporal credentials. Please contact support."
1044
+ )
1045
+
1046
+ # For backwards compatibility with existing code
1047
+ namespace = {
1048
+ "namespace_name": temporal_credentials["namespace"],
1049
+ "api_key_encrypted": temporal_credentials["api_key"],
1050
+ }
1051
+
1052
+ # Generate worker ID
1053
+ worker_id = str(uuid.uuid4())
1054
+
1055
+ # Use worker's provided URL (preserves user configuration)
1056
+ # Fallback to request URL for backward compatibility with old workers
1057
+ if body.control_plane_url:
1058
+ control_plane_url = body.control_plane_url.rstrip("/")
1059
+ logger.info(
1060
+ "using_worker_provided_control_plane_url",
1061
+ queue_id=queue_id,
1062
+ worker_url=control_plane_url,
1063
+ request_url=f"{request.url.scheme}://{request.url.netloc}"
1064
+ )
1065
+ else:
1066
+ # Backward compatibility for old workers
1067
+ control_plane_url = f"{request.url.scheme}://{request.url.netloc}"
1068
+ logger.info(
1069
+ "using_request_derived_control_plane_url",
1070
+ queue_id=queue_id,
1071
+ control_plane_url=control_plane_url
1072
+ )
1073
+ temporal_host = temporal_credentials["host"]
1074
+
1075
+ # Get LiteLLM configuration for agno workflows/activities
1076
+ litellm_api_url = os.getenv("LITELLM_API_URL", "https://llm-proxy.kubiya.ai")
1077
+ litellm_api_key = os.getenv("LITELLM_API_KEY", "")
1078
+
1079
+ # Create worker heartbeat record
1080
+
1081
+ now = datetime.now(timezone.utc)
1082
+ worker_metadata = {}
1083
+ if body.system_info:
1084
+ worker_metadata = body.system_info.model_dump(exclude_none=True)
1085
+ logger.info(
1086
+ "worker_registration_with_system_info",
1087
+ worker_id=worker_id[:8],
1088
+ hostname=worker_metadata.get("hostname"),
1089
+ sdk_version=worker_metadata.get("sdk_version"),
1090
+ pid=worker_metadata.get("pid"),
1091
+ cwd=worker_metadata.get("cwd"),
1092
+ )
1093
+
1094
+ # Add LLM gateway URL from control plane config
1095
+ worker_metadata["llm_gateway_url"] = litellm_api_url
1096
+
1097
+ worker_heartbeat = WorkerHeartbeat(
1098
+ id=worker_id,
1099
+ worker_id=worker_id,
1100
+ organization_id=org_id,
1101
+ worker_queue_id=queue_id,
1102
+ environment_name=environment_name,
1103
+ status="active",
1104
+ tasks_processed=0,
1105
+ registered_at=now,
1106
+ last_heartbeat=now,
1107
+ updated_at=now,
1108
+ worker_metadata=worker_metadata,  # persist the system info collected above
1109
+ )
1110
+
1111
+ db.add(worker_heartbeat)
1112
+ db.commit()
1113
+
1114
+ # Task queue name is just the queue UUID for security
1115
+ task_queue_name = queue_id
1116
+
1117
+ # Determine WebSocket configuration
1118
+ # WebSocket is only supported when the control plane is NOT running in a serverless environment
1119
+ # (Vercel, AWS Lambda, etc. don't support persistent WebSocket connections)
1120
+ control_plane_env = detect_environment()
1121
+ websocket_enabled = (
1122
+ os.getenv("WEBSOCKET_ENABLED", "true").lower() == "true"
1123
+ and control_plane_env == "standard"
1124
+ )
1125
+ websocket_url = None
1126
+
1127
+ if websocket_enabled:
1128
+ # Convert HTTP(S) to WS(S) for WebSocket URL
1129
+ ws_base = control_plane_url.replace("https://", "wss://").replace("http://", "ws://")
1130
+ websocket_url = f"{ws_base}/api/v1/ws/workers/{worker_id}"
1131
+
1132
+ if not websocket_enabled and control_plane_env == "serverless":
1133
+ logger.info(
1134
+ "websocket_disabled_serverless_control_plane",
1135
+ worker_id=worker_id[:8],
1136
+ environment=control_plane_env
1137
+ )
1138
+
1139
+ # Redis configuration for direct event streaming (default fast path)
1140
+ # Workers will use Redis directly instead of HTTP endpoint for better performance
1141
+ redis_url = None
1142
+ redis_password = None
1143
+ redis_enabled = False
1144
+
1145
+ if settings.redis_url:
1146
+ redis_url = settings.redis_url
1147
+ redis_enabled = True
1148
+
1149
+ # Extract password from Redis URL if present (redis://:password@host:port/db)
1150
+ if "@" in redis_url and ":" in redis_url:
1151
+ try:
1152
+ # Parse URL to extract password
1153
+ from urllib.parse import urlparse
1154
+ parsed = urlparse(redis_url)
1155
+ if parsed.password:
1156
+ redis_password = parsed.password
1157
+ except Exception as e:
1158
+ logger.warning(
1159
+ "redis_password_extraction_failed",
1160
+ error=str(e),
1161
+ worker_id=worker_id[:8],
1162
+ )
1163
+
1164
+ logger.info(
1165
+ "redis_config_provided_to_worker",
1166
+ worker_id=worker_id[:8],
1167
+ redis_url=redis_url.split("@")[-1] if "@" in redis_url else redis_url, # Log without password
1168
+ )
1169
+
1170
+ # NATS configuration (optional, enterprise-grade event bus)
1171
+ nats_config = None
1172
+ if (
1173
+ hasattr(settings, "event_bus")
1174
+ and settings.event_bus
1175
+ and isinstance(settings.event_bus, dict)
1176
+ and "nats" in settings.event_bus
1177
+ and settings.event_bus["nats"].get("enabled", False)
1178
+ ):
1179
+ try:
1180
+ from control_plane_api.app.lib.nats import NATSCredentialsManager
1181
+
1182
+ # Get NATS operator credentials from settings/env
1183
+ nats_operator_jwt = os.getenv("NATS_OPERATOR_JWT")
1184
+ nats_operator_seed = os.getenv("NATS_OPERATOR_SEED")
1185
+
1186
+ if nats_operator_jwt and nats_operator_seed:
1187
+ # Create credentials manager
1188
+ creds_manager = NATSCredentialsManager(
1189
+ operator_jwt=nats_operator_jwt,
1190
+ operator_seed=nats_operator_seed,
1191
+ )
1192
+
1193
+ # Generate temporary worker credentials (24-hour TTL)
1194
+ worker_creds = creds_manager.create_worker_credentials(
1195
+ worker_id=worker_id,
1196
+ organization_id=org_id,
1197
+ ttl_hours=24,
1198
+ )
1199
+
1200
+ # Get NATS URL from config
1201
+ nats_url = settings.event_bus["nats"].get("nats_url")
1202
+
1203
+ # Build NATS config for worker
1204
+ nats_config = {
1205
+ "nats_url": nats_url,
1206
+ "nats_jwt": worker_creds.jwt,
1207
+ "nats_seed": worker_creds.seed,
1208
+ "subject_prefix": worker_creds.subject_prefix,
1209
+ "organization_id": org_id,
1210
+ "worker_id": worker_id,
1211
+ "jetstream_enabled": str(
1212
+ settings.event_bus["nats"].get("jetstream_enabled", True)
1213
+ ),
1214
+ "expires_at": worker_creds.expires_at.isoformat(),
1215
+ }
1216
+
1217
+ logger.info(
1218
+ "nats_credentials_generated_for_worker",
1219
+ worker_id=worker_id[:8],
1220
+ organization_id=org_id,
1221
+ subject_prefix=worker_creds.subject_prefix,
1222
+ expires_at=worker_creds.expires_at.isoformat(),
1223
+ )
1224
+ else:
1225
+ logger.warning(
1226
+ "nats_operator_credentials_not_configured",
1227
+ message="NATS enabled but NATS_OPERATOR_JWT or NATS_OPERATOR_SEED not set",
1228
+ )
1229
+
1230
+ except ImportError:
1231
+ logger.warning(
1232
+ "nats_dependency_missing",
1233
+ message="NATS credentials generation skipped - nkeys not installed",
1234
+ )
1235
+ except Exception as e:
1236
+ logger.error(
1237
+ "nats_credentials_generation_failed",
1238
+ error=str(e),
1239
+ worker_id=worker_id[:8],
1240
+ )
1241
+
1242
+ logger.info(
1243
+ "worker_started_for_queue",
1244
+ worker_id=worker_id,
1245
+ queue_id=queue_id,
1246
+ task_queue_name=task_queue_name,
1247
+ org_id=org_id,
1248
+ websocket_enabled=websocket_enabled,
1249
+ nats_enabled=nats_config is not None,
1250
+ )
1251
+
1252
+ return WorkerStartResponse(
1253
+ worker_id=worker_id,
1254
+ task_queue_name=task_queue_name,
1255
+ temporal_namespace=namespace["namespace_name"],
1256
+ temporal_host=temporal_host,
1257
+ temporal_api_key=namespace["api_key_encrypted"],
1258
+ organization_id=org_id,
1259
+ control_plane_url=control_plane_url,
1260
+ heartbeat_interval=queue.heartbeat_interval or 60,
1261
+ litellm_api_url=litellm_api_url,
1262
+ litellm_api_key=litellm_api_key,
1263
+ queue_name=queue.name,
1264
+ environment_name=environment_name,
1265
+ queue_id=queue_id,
1266
+ queue_ephemeral=queue.ephemeral or False,
1267
+ queue_single_execution=queue.single_execution_mode or False,
1268
+ redis_url=redis_url,
1269
+ redis_password=redis_password,
1270
+ redis_enabled=redis_enabled,
1271
+ websocket_enabled=websocket_enabled,
1272
+ websocket_url=websocket_url,
1273
+ websocket_features=["events", "control", "heartbeat", "config_update"],
1274
+ nats_config=nats_config,
1275
+ control_plane_sdk_version=control_plane_sdk_version,
1276
+ )
1277
+
1278
+ except HTTPException:
1279
+ raise
1280
+ except Exception as e:
1281
+ logger.error(
1282
+ "worker_start_for_queue_failed",
1283
+ error=str(e),
1284
+ error_type=type(e).__name__,
1285
+ queue_id=queue_id,
1286
+ org_id=organization.get("id")
1287
+ )
1288
+ raise HTTPException(
1289
+ status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
1290
+ detail=f"Failed to start worker due to an internal error. Please try again or contact support. (Error ID: {queue_id[:8]})"
1291
+ )
1292
+
1293
+
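The Redis password-extraction step inside this endpoint comes down to urllib.parse; a tiny standalone version (the URL below is a placeholder):

from urllib.parse import urlparse

redis_url = "redis://:s3cret@redis.internal:6379/0"  # placeholder credentials and host
parsed = urlparse(redis_url)
print(parsed.password)           # s3cret
print(redis_url.split("@")[-1])  # redis.internal:6379/0  (safe to log - no credentials)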
1294
+ def _generate_local_script(worker_id: str, control_plane_url: str) -> str:
1295
+ """Generate a bash script for local Python installation"""
1296
+ return f"""#!/bin/bash
1297
+ # Kubiya Agent Worker - Local Installation Script
1298
+ # Generated: {datetime.now(timezone.utc).isoformat()}
1299
+
1300
+ set -e
1301
+
1302
+ echo "🚀 Setting up Kubiya Agent Worker..."
1303
+ echo ""
1304
+
1305
+ # Configuration
1306
+ WORKER_ID="{worker_id}"
1307
+ CONTROL_PLANE_URL="{control_plane_url}"
1308
+
1309
+ # Check if KUBIYA_API_KEY is set
1310
+ if [ -z "$KUBIYA_API_KEY" ]; then
1311
+ echo "❌ Error: KUBIYA_API_KEY environment variable is not set"
1312
+ echo "Please set it with: export KUBIYA_API_KEY=your-api-key"
1313
+ exit 1
1314
+ fi
1315
+
1316
+ # Check Python version
1317
+ if ! command -v python3 &> /dev/null; then
1318
+ echo "❌ Error: Python 3 is not installed"
1319
+ exit 1
1320
+ fi
1321
+
1322
+ PYTHON_VERSION=$(python3 --version | cut -d' ' -f2 | cut -d'.' -f1,2)
1323
+ echo "✓ Found Python $PYTHON_VERSION"
1324
+
1325
+ # Create directory
1326
+ WORKER_DIR="$HOME/.kubiya/workers/$WORKER_ID"
1327
+ mkdir -p "$WORKER_DIR"
1328
+ cd "$WORKER_DIR"
1329
+
1330
+ echo "✓ Created worker directory: $WORKER_DIR"
1331
+
1332
+ # Create virtual environment
1333
+ echo "📦 Creating virtual environment..."
1334
+ python3 -m venv venv
1335
+ source venv/bin/activate
1336
+
1337
+ # Install worker package (includes all dependencies from pyproject.toml)
1338
+ echo "📦 Installing worker package..."
1339
+ if command -v uv &> /dev/null; then
1340
+ echo "✓ Using uv (fast mode)"
1341
+ uv pip install --quiet kubiya-control-plane-api[worker]
1342
+ else
1343
+ echo "ℹ️ Using pip (consider installing uv: https://github.com/astral-sh/uv)"
1344
+ pip install --quiet --upgrade pip
1345
+ pip install --quiet kubiya-control-plane-api[worker]
1346
+ fi
1347
+
1348
+ echo "✓ Worker package installed"
1349
+
1350
+ # Create systemd service file (optional)
1351
+ cat > kubiya-worker.service <<EOF
1352
+ [Unit]
1353
+ Description=Kubiya Agent Worker
1354
+ After=network.target
1355
+
1356
+ [Service]
1357
+ Type=simple
1358
+ User=$USER
1359
+ WorkingDirectory=$WORKER_DIR
1360
+ Environment="WORKER_ID=$WORKER_ID"
1361
+ Environment="KUBIYA_API_KEY=$KUBIYA_API_KEY"
1362
+ Environment="CONTROL_PLANE_URL=$CONTROL_PLANE_URL"
1363
+ ExecStart=$WORKER_DIR/venv/bin/python $WORKER_DIR/worker.py
1364
+ Restart=always
1365
+ RestartSec=10
1366
+
1367
+ [Install]
1368
+ WantedBy=multi-user.target
1369
+ EOF
1370
+
1371
+ echo "✓ Systemd service file created (optional)"
1372
+
1373
+ # Create run script
1374
+ cat > run.sh <<EOF
1375
+ #!/bin/bash
1376
+ cd "$WORKER_DIR"
1377
+ source venv/bin/activate
1378
+ export WORKER_ID="$WORKER_ID"
1379
+ export KUBIYA_API_KEY="$KUBIYA_API_KEY"
1380
+ export CONTROL_PLANE_URL="$CONTROL_PLANE_URL"
1381
+ python worker.py
1382
+ EOF
1383
+
1384
+ chmod +x run.sh
1385
+
1386
+ echo ""
1387
+ echo "✅ Installation complete!"
1388
+ echo ""
1389
+ echo "To start the worker:"
1390
+ echo " cd $WORKER_DIR && ./run.sh"
1391
+ echo ""
1392
+ echo "Or to install as a systemd service:"
1393
+ echo " sudo cp $WORKER_DIR/kubiya-worker.service /etc/systemd/system/"
1394
+ echo " sudo systemctl daemon-reload"
1395
+ echo " sudo systemctl enable kubiya-worker"
1396
+ echo " sudo systemctl start kubiya-worker"
1397
+ echo ""
1398
+ """
1399
+
1400
+
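Illustrative use of the generator above, e.g. from a test or shell session (the worker id and URL are placeholders):

script = _generate_local_script(
    worker_id="00000000-0000-0000-0000-000000000000",
    control_plane_url="https://control-plane.example.com",
)
with open("install-worker.sh", "w") as fh:
    fh.write(script)
# Review the script, then run it with: bash install-worker.sh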
1401
+ def _generate_docker_script(worker_id: str, control_plane_url: str, queue_name: str, environment_name: str) -> str:
1402
+ """Generate Docker commands for running the worker"""
1403
+ return f"""# Kubiya Agent Worker - Docker Installation
1404
+ # Generated: {datetime.now(timezone.utc).isoformat()}
1405
+
1406
+ # Configuration
1407
+ WORKER_ID="{worker_id}"
1408
+ CONTROL_PLANE_URL="{control_plane_url}"
1409
+ QUEUE_NAME="{queue_name}"
1410
+ ENVIRONMENT_NAME="{environment_name}"
1411
+
1412
+ # Make sure to set your API key
1413
+ # export KUBIYA_API_KEY=your-api-key
1414
+
1415
+ # Run with Docker
1416
+ docker run -d \\
1417
+ --name kubiya-worker-{queue_name}-{worker_id[:8]} \\
1418
+ --restart unless-stopped \\
1419
+ -e WORKER_ID="$WORKER_ID" \\
1420
+ -e KUBIYA_API_KEY="$KUBIYA_API_KEY" \\
1421
+ -e CONTROL_PLANE_URL="$CONTROL_PLANE_URL" \\
1422
+ -e LOG_LEVEL="INFO" \\
1423
+ kubiya/agent-worker:latest
1424
+
1425
+ # Check logs
1426
+ # docker logs -f kubiya-worker-{queue_name}-{worker_id[:8]}
1427
+
1428
+ # Stop worker
1429
+ # docker stop kubiya-worker-{queue_name}-{worker_id[:8]}
1430
+
1431
+ # Remove worker
1432
+ # docker rm kubiya-worker-{queue_name}-{worker_id[:8]}
1433
+
1434
+ # Docker Compose (save as docker-compose.yml)
1435
+ cat > docker-compose.yml <<EOF
1436
+ version: '3.8'
1437
+
1438
+ services:
1439
+ worker:
1440
+ image: kubiya/agent-worker:latest
1441
+ container_name: kubiya-worker-{queue_name}
1442
+ restart: unless-stopped
1443
+ environment:
1444
+ - WORKER_ID={worker_id}
1445
+ - KUBIYA_API_KEY=${{KUBIYA_API_KEY}}
1446
+ - CONTROL_PLANE_URL={control_plane_url}
1447
+ - LOG_LEVEL=INFO
1448
+ healthcheck:
1449
+ test: ["CMD", "python", "-c", "import httpx; httpx.get('{control_plane_url}/health')"]
1450
+ interval: 30s
1451
+ timeout: 10s
1452
+ retries: 3
1453
+ start_period: 10s
1454
+ EOF
1455
+
1456
+ # To use docker-compose:
1457
+ # docker-compose up -d
1458
+ """
1459
+
1460
+
1461
+ def _generate_kubernetes_script(worker_id: str, control_plane_url: str, queue_name: str, environment_name: str) -> str:
1462
+ """Generate Kubernetes deployment YAML"""
1463
+ return f"""# Kubiya Agent Worker - Kubernetes Deployment
1464
+ # Generated: {datetime.now(timezone.utc).isoformat()}
1465
+ #
1466
+ # To deploy:
1467
+ # 1. Create secret: kubectl create secret generic kubiya-worker-secret --from-literal=api-key=YOUR_API_KEY
1468
+ # 2. Apply this file: kubectl apply -f kubiya-worker.yaml
1469
+ #
1470
+ ---
1471
+ apiVersion: v1
1472
+ kind: ConfigMap
1473
+ metadata:
1474
+ name: kubiya-worker-{queue_name}-config
1475
+ labels:
1476
+ app: kubiya-worker
1477
+ queue: {queue_name}
1478
+ environment: {environment_name}
1479
+ data:
1480
+ WORKER_ID: "{worker_id}"
1481
+ CONTROL_PLANE_URL: "{control_plane_url}"
1482
+ LOG_LEVEL: "INFO"
1483
+
1484
+ ---
1485
+ apiVersion: apps/v1
1486
+ kind: Deployment
1487
+ metadata:
1488
+ name: kubiya-worker-{queue_name}
1489
+ labels:
1490
+ app: kubiya-worker
1491
+ queue: {queue_name}
1492
+ environment: {environment_name}
1493
+ spec:
1494
+ replicas: 1
1495
+ selector:
1496
+ matchLabels:
1497
+ app: kubiya-worker
1498
+ queue: {queue_name}
1499
+ template:
1500
+ metadata:
1501
+ labels:
1502
+ app: kubiya-worker
1503
+ queue: {queue_name}
1504
+ environment: {environment_name}
1505
+ spec:
1506
+ containers:
1507
+ - name: worker
1508
+ image: kubiya/agent-worker:latest
1509
+ imagePullPolicy: Always
1510
+ envFrom:
1511
+ - configMapRef:
1512
+ name: kubiya-worker-{queue_name}-config
1513
+ env:
1514
+ - name: KUBIYA_API_KEY
1515
+ valueFrom:
1516
+ secretKeyRef:
1517
+ name: kubiya-worker-secret
1518
+ key: api-key
1519
+ resources:
1520
+ requests:
1521
+ memory: "512Mi"
1522
+ cpu: "250m"
1523
+ limits:
1524
+ memory: "2Gi"
1525
+ cpu: "1000m"
1526
+ livenessProbe:
1527
+ httpGet:
1528
+ path: /health
1529
+ port: 8080
1530
+ initialDelaySeconds: 30
1531
+ periodSeconds: 30
1532
+ timeoutSeconds: 10
1533
+ failureThreshold: 3
1534
+ readinessProbe:
1535
+ httpGet:
1536
+ path: /health
1537
+ port: 8080
1538
+ initialDelaySeconds: 10
1539
+ periodSeconds: 10
1540
+ timeoutSeconds: 5
1541
+ failureThreshold: 3
1542
+ restartPolicy: Always
1543
+
1544
+ ---
1545
+ apiVersion: v1
1546
+ kind: Service
1547
+ metadata:
1548
+ name: kubiya-worker-{queue_name}
1549
+ labels:
1550
+ app: kubiya-worker
1551
+ queue: {queue_name}
1552
+ spec:
1553
+ selector:
1554
+ app: kubiya-worker
1555
+ queue: {queue_name}
1556
+ ports:
1557
+ - protocol: TCP
1558
+ port: 8080
1559
+ targetPort: 8080
1560
+ type: ClusterIP
1561
+
1562
+ ---
1563
+ # Optional: HorizontalPodAutoscaler
1564
+ # apiVersion: autoscaling/v2
1565
+ # kind: HorizontalPodAutoscaler
1566
+ # metadata:
1567
+ # name: kubiya-worker-{queue_name}
1568
+ # spec:
1569
+ # scaleTargetRef:
1570
+ # apiVersion: apps/v1
1571
+ # kind: Deployment
1572
+ # name: kubiya-worker-{queue_name}
1573
+ # minReplicas: 1
1574
+ # maxReplicas: 10
1575
+ # metrics:
1576
+ # - type: Resource
1577
+ # resource:
1578
+ # name: cpu
1579
+ # target:
1580
+ # type: Utilization
1581
+ # averageUtilization: 70
1582
+ """
1583
+
1584
+
1585
+ class WorkerQueueCommandResponse(BaseModel):
1586
+ """Worker queue connection command"""
1587
+ queue_id: str
1588
+ command: str
1589
+ command_parts: dict
1590
+ can_register: bool
1591
+ queue_status: str
1592
+ active_workers: int
1593
+ max_workers: Optional[int]
1594
+
1595
+
1596
+ class WorkerDetail(BaseModel):
1597
+ """Individual worker details"""
1598
+ id: str
1599
+ worker_id: str
1600
+ status: str
1601
+ tasks_processed: int
1602
+ current_task_id: Optional[str]
1603
+ last_heartbeat: str
1604
+ registered_at: str
1605
+ system_info: Optional[WorkerSystemInfo] = None
1606
+ logs: Optional[List[str]] = None
1607
+ worker_metadata: dict
1608
+
1609
+
1610
+ @router.get("/worker-queues/{queue_id}/workers", response_model=List[WorkerDetail])
1611
+ @instrument_endpoint("worker_queues.list_queue_workers")
1612
+ async def list_queue_workers(
1613
+ queue_id: str,
1614
+ request: Request,
1615
+ organization: dict = Depends(get_current_organization),
1616
+ db: Session = Depends(get_db),
1617
+ ):
1618
+ """
1619
+ List all workers for a specific queue with detailed information.
1620
+ """
1621
+ try:
1622
+ org_id = organization["id"]
1623
+
1624
+ # Get active workers from Redis for this queue
1625
+ active_workers = await get_active_workers_from_redis(org_id, queue_id, db=db)
1626
+
1627
+ # Get worker registration details from database (registered_at, worker_id, worker_metadata)
1628
+ if active_workers:
1629
+ db_workers = (
1630
+ db.query(WorkerHeartbeat)
1631
+ .filter(
1632
+ WorkerHeartbeat.organization_id == org_id,
1633
+ WorkerHeartbeat.id.in_(list(active_workers.keys()))
1634
+ )
1635
+ .all()
1636
+ )
1637
+ db_workers_map = {str(w.id): w for w in db_workers}
1638
+ else:
1639
+ db_workers_map = {}
1640
+
1641
+ workers = []
1642
+ for worker_id, heartbeat_data in active_workers.items():
1643
+ # Get DB data for registration time
1644
+ db_data = db_workers_map.get(worker_id, None)
1645
+
1646
+ # Extract system info and logs from Redis heartbeat data
1647
+ metadata = heartbeat_data.get("metadata", {})
1648
+ system_info_data = heartbeat_data.get("system_info")
1649
+ logs = heartbeat_data.get("logs", [])
1650
+
1651
+ # Fall back to worker_metadata from database if system_info not in Redis
1652
+ if not system_info_data and db_data and db_data.worker_metadata:
1653
+ system_info_data = db_data.worker_metadata
1654
+
1655
+ system_info = WorkerSystemInfo(**system_info_data) if system_info_data else None
1656
+
1657
+ workers.append(
1658
+ WorkerDetail(
1659
+ id=worker_id,
1660
+ worker_id=db_data.worker_id if db_data else worker_id,
1661
+ status=heartbeat_data.get("status", "unknown"),
1662
+ tasks_processed=heartbeat_data.get("tasks_processed", 0),
1663
+ current_task_id=heartbeat_data.get("current_task_id"),
1664
+ last_heartbeat=heartbeat_data.get("last_heartbeat", ""),
1665
+ registered_at=db_data.registered_at.isoformat() if db_data and db_data.registered_at else "",
1666
+ system_info=system_info,
1667
+ logs=logs,
1668
+ worker_metadata=metadata,
1669
+ )
1670
+ )
1671
+
1672
+ # Sort by last_heartbeat desc
1673
+ workers.sort(key=lambda w: w.last_heartbeat, reverse=True)
1674
+
1675
+ logger.info(
1676
+ "queue_workers_listed",
1677
+ queue_id=queue_id,
1678
+ worker_count=len(workers),
1679
+ org_id=org_id,
1680
+ )
1681
+
1682
+ return workers
1683
+
1684
+ except HTTPException:
1685
+ raise
1686
+ except Exception as e:
1687
+ logger.error("queue_workers_list_failed", error=str(e), queue_id=queue_id)
1688
+ raise HTTPException(
1689
+ status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
1690
+ detail=f"Failed to list queue workers: {str(e)}"
1691
+ )
1692
+
1693
+
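This endpoint joins live heartbeat data from Redis with registration rows from the database. The merge itself is plain dictionary work; a toy version with in-memory stand-ins for both sources:

active_workers = {  # shape of the Redis heartbeat payloads (illustrative values)
    "w-1": {"status": "active", "tasks_processed": 7, "last_heartbeat": "2025-01-01T00:00:05Z"},
    "w-2": {"status": "busy", "tasks_processed": 3, "last_heartbeat": "2025-01-01T00:00:09Z"},
}
db_rows = {"w-1": {"registered_at": "2024-12-31T23:59:00Z"}}  # DB records keyed by worker id

workers = [
    {
        "id": worker_id,
        "registered_at": db_rows.get(worker_id, {}).get("registered_at", ""),
        **heartbeat,
    }
    for worker_id, heartbeat in active_workers.items()
]
workers.sort(key=lambda w: w["last_heartbeat"], reverse=True)  # most recent heartbeat first
print([w["id"] for w in workers])  # ['w-2', 'w-1']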
1694
+ @router.get("/worker-queues/{queue_id}/metrics", response_model=WorkerQueueMetricsResponse)
1695
+ @instrument_endpoint("worker_queues.get_worker_queue_metrics")
1696
+ async def get_worker_queue_metrics(
1697
+ queue_id: str,
1698
+ request: Request,
1699
+ organization: dict = Depends(get_current_organization),
1700
+ db: Session = Depends(get_db),
1701
+ ):
1702
+ """
1703
+ Get comprehensive metrics for a worker queue.
1704
+
1705
+ Returns worker health metrics, task statistics, and performance data.
1706
+ """
1707
+ try:
1708
+ org_id = organization["id"]
1709
+
1710
+ # Use service layer for business logic
1711
+ metrics_service = WorkerQueueMetricsService(db)
1712
+ metrics = await metrics_service.get_queue_metrics(queue_id, org_id)
1713
+
1714
+ logger.info(
1715
+ "queue_metrics_retrieved",
1716
+ queue_id=queue_id,
1717
+ org_id=org_id
1718
+ )
1719
+
1720
+ return metrics
1721
+
1722
+ except ValueError as e:
1723
+ raise HTTPException(
1724
+ status_code=status.HTTP_404_NOT_FOUND,
1725
+ detail=str(e)
1726
+ )
1727
+ except HTTPException:
1728
+ raise
1729
+ except Exception as e:
1730
+ logger.error(
1731
+ "queue_metrics_failed",
1732
+ error=str(e),
1733
+ queue_id=queue_id,
1734
+ org_id=org_id
1735
+ )
1736
+ raise HTTPException(
1737
+ status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
1738
+ detail=f"Failed to get queue metrics: {str(e)}"
1739
+ )
1740
+
1741
+
1742
+ @router.get("/worker-queues/{queue_id}/workflows", response_model=WorkflowsListResponse)
1743
+ @instrument_endpoint("worker_queues.list_queue_workflows")
1744
+ async def list_queue_workflows(
1745
+ queue_id: str,
1746
+ request: Request,
1747
+ status_filter: Optional[str] = None,
1748
+ limit: int = 100,
1749
+ organization: dict = Depends(get_current_organization),
1750
+ db: Session = Depends(get_db),
1751
+ ):
1752
+ """
1753
+ List workflows/tasks for a worker queue.
1754
+
1755
+ Returns list of workflows with status counts and filtering options.
1756
+ """
1757
+ try:
1758
+ org_id = organization["id"]
1759
+
1760
+ # Import service here to avoid circular imports
1761
+ from control_plane_api.app.services.workflow_operations_service import WorkflowOperationsService
1762
+
1763
+ # Use service layer for business logic
1764
+ workflow_service = WorkflowOperationsService(db)
1765
+ workflows = await workflow_service.list_queue_workflows(
1766
+ queue_id=queue_id,
1767
+ organization_id=org_id,
1768
+ status_filter=status_filter,
1769
+ limit=limit
1770
+ )
1771
+
1772
+ logger.info(
1773
+ "queue_workflows_listed",
1774
+ queue_id=queue_id,
1775
+ total=workflows.total,
1776
+ org_id=org_id
1777
+ )
1778
+
1779
+ return workflows
1780
+
1781
+ except ValueError as e:
1782
+ raise HTTPException(
1783
+ status_code=status.HTTP_404_NOT_FOUND,
1784
+ detail=str(e)
1785
+ )
1786
+ except HTTPException:
1787
+ raise
1788
+ except Exception as e:
1789
+ logger.error(
1790
+ "queue_workflows_list_failed",
1791
+ error=str(e),
1792
+ queue_id=queue_id,
1793
+ org_id=org_id
1794
+ )
1795
+ raise HTTPException(
1796
+ status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
1797
+ detail=f"Failed to list queue workflows: {str(e)}"
1798
+ )
1799
+
1800
+
1801
+ @router.get("/worker-queues/{queue_id}/worker-command", response_model=WorkerQueueCommandResponse)
1802
+ @instrument_endpoint("worker_queues.get_worker_queue_command")
1803
+ async def get_worker_queue_command(
1804
+ queue_id: str,
1805
+ request: Request,
1806
+ organization: dict = Depends(get_current_organization),
1807
+ db: Session = Depends(get_db),
1808
+ ):
1809
+ """
1810
+ Get the worker registration command for a specific worker queue.
1811
+
1812
+ Returns the kubiya worker start command with the queue ID that users
1813
+ should run to start a worker for this specific queue.
1814
+ """
1815
+ try:
1816
+ org_id = organization["id"]
1817
+
1818
+ # Get worker queue
1819
+ queue = (
1820
+ db.query(WorkerQueue)
1821
+ .filter(WorkerQueue.id == queue_id, WorkerQueue.organization_id == org_id)
1822
+ .first()
1823
+ )
1824
+
1825
+ if not queue:
1826
+ raise HTTPException(status_code=404, detail="Worker queue not found")
1827
+
1828
+ queue_status = queue.status or "unknown"
1829
+
1830
+ # Check if queue is active
1831
+ can_register = queue_status == "active"
1832
+
1833
+ # Get active workers from Redis for this specific queue
1834
+ active_workers_dict = await get_active_workers_from_redis(org_id, queue_id, db=db)
1835
+ active_worker_count = len(active_workers_dict)
1836
+
1837
+ # Build command
1838
+ command = f"kubiya worker start --queue-id {queue_id}"
1839
+
1840
+ command_parts = {
1841
+ "binary": "kubiya",
1842
+ "subcommand": "worker start",
1843
+ "flags": {
1844
+ "--queue-id": queue_id,
1845
+ },
1846
+ }
1847
+
1848
+ logger.info(
1849
+ "worker_queue_command_retrieved",
1850
+ queue_id=queue_id,
1851
+ can_register=can_register,
1852
+ status=queue_status,
1853
+ active_workers=active_worker_count,
1854
+ org_id=org_id,
1855
+ )
1856
+
1857
+ return WorkerQueueCommandResponse(
1858
+ queue_id=queue_id,
1859
+ command=command,
1860
+ command_parts=command_parts,
1861
+ can_register=can_register,
1862
+ queue_status=queue_status,
1863
+ active_workers=active_worker_count,
1864
+ max_workers=queue.max_workers,
1865
+ )
1866
+
1867
+ except HTTPException:
1868
+ raise
1869
+ except Exception as e:
1870
+ logger.error("worker_queue_command_failed", error=str(e), queue_id=queue_id)
1871
+ raise HTTPException(
1872
+ status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
1873
+ detail=f"Failed to get worker queue command: {str(e)}"
1874
+ )
1875
+
1876
+
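command_parts lets UI or CLI clients rebuild the command without string parsing; for example (the queue id below is a placeholder):

command_parts = {
    "binary": "kubiya",
    "subcommand": "worker start",
    "flags": {"--queue-id": "3f2a0c1e-placeholder"},
}
flags = " ".join(f"{name} {value}" for name, value in command_parts["flags"].items())
command = f"{command_parts['binary']} {command_parts['subcommand']} {flags}"
print(command)  # kubiya worker start --queue-id 3f2a0c1e-placeholder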
1877
+ def _generate_openshift_script(worker_id: str, control_plane_url: str, queue_name: str, environment_name: str) -> str:
1878
+ """Generate OpenShift deployment YAML"""
1879
+ return f"""# Kubiya Agent Worker - OpenShift Deployment
1880
+ # Generated: {datetime.now(timezone.utc).isoformat()}
1881
+ #
1882
+ # To deploy:
1883
+ # 1. Create secret: oc create secret generic kubiya-worker-secret --from-literal=api-key=YOUR_API_KEY
1884
+ # 2. Apply this file: oc apply -f kubiya-worker.yaml
1885
+ #
1886
+ ---
1887
+ apiVersion: v1
1888
+ kind: ConfigMap
1889
+ metadata:
1890
+ name: kubiya-worker-{queue_name}-config
1891
+ labels:
1892
+ app: kubiya-worker
1893
+ queue: {queue_name}
1894
+ environment: {environment_name}
1895
+ data:
1896
+ WORKER_ID: "{worker_id}"
1897
+ CONTROL_PLANE_URL: "{control_plane_url}"
1898
+ LOG_LEVEL: "INFO"
1899
+
1900
+ ---
1901
+ apiVersion: apps.openshift.io/v1
1902
+ kind: DeploymentConfig
1903
+ metadata:
1904
+ name: kubiya-worker-{queue_name}
1905
+ labels:
1906
+ app: kubiya-worker
1907
+ queue: {queue_name}
1908
+ environment: {environment_name}
1909
+ spec:
1910
+ replicas: 1
1911
+ selector:
1912
+ app: kubiya-worker
1913
+ queue: {queue_name}
1914
+ template:
1915
+ metadata:
1916
+ labels:
1917
+ app: kubiya-worker
1918
+ queue: {queue_name}
1919
+ environment: {environment_name}
1920
+ spec:
1921
+ containers:
1922
+ - name: worker
1923
+ image: kubiya/agent-worker:latest
1924
+ imagePullPolicy: Always
1925
+ envFrom:
1926
+ - configMapRef:
1927
+ name: kubiya-worker-{queue_name}-config
1928
+ env:
1929
+ - name: KUBIYA_API_KEY
1930
+ valueFrom:
1931
+ secretKeyRef:
1932
+ name: kubiya-worker-secret
1933
+ key: api-key
1934
+ resources:
1935
+ requests:
1936
+ memory: "512Mi"
1937
+ cpu: "250m"
1938
+ limits:
1939
+ memory: "2Gi"
1940
+ cpu: "1000m"
1941
+ livenessProbe:
1942
+ httpGet:
1943
+ path: /health
1944
+ port: 8080
1945
+ initialDelaySeconds: 30
1946
+ periodSeconds: 30
1947
+ timeoutSeconds: 10
1948
+ failureThreshold: 3
1949
+ readinessProbe:
1950
+ httpGet:
1951
+ path: /health
1952
+ port: 8080
1953
+ initialDelaySeconds: 10
1954
+ periodSeconds: 10
1955
+ timeoutSeconds: 5
1956
+ failureThreshold: 3
1957
+ restartPolicy: Always
1958
+ securityContext:
1959
+ runAsNonRoot: true
1960
+ runAsUser: 1000
1961
+ triggers:
1962
+ - type: ConfigChange
1963
+ - type: ImageChange
1964
+ imageChangeParams:
1965
+ automatic: true
1966
+ containerNames:
1967
+ - worker
1968
+ from:
1969
+ kind: ImageStreamTag
1970
+ name: agent-worker:latest
1971
+
1972
+ ---
1973
+ apiVersion: v1
1974
+ kind: Service
1975
+ metadata:
1976
+ name: kubiya-worker-{queue_name}
1977
+ labels:
1978
+ app: kubiya-worker
1979
+ queue: {queue_name}
1980
+ spec:
1981
+ selector:
1982
+ app: kubiya-worker
1983
+ queue: {queue_name}
1984
+ ports:
1985
+ - protocol: TCP
1986
+ port: 8080
1987
+ targetPort: 8080
1988
+ type: ClusterIP
1989
+
1990
+ ---
1991
+ # Optional: Route to expose the service
1992
+ # apiVersion: route.openshift.io/v1
1993
+ # kind: Route
1994
+ # metadata:
1995
+ # name: kubiya-worker-{queue_name}
1996
+ # labels:
1997
+ # app: kubiya-worker
1998
+ # queue: {queue_name}
1999
+ # spec:
2000
+ # to:
2001
+ # kind: Service
2002
+ # name: kubiya-worker-{queue_name}
2003
+ # port:
2004
+ # targetPort: 8080
2005
+ # tls:
2006
+ # termination: edge
2007
+ # insecureEdgeTerminationPolicy: Redirect
2008
+ """
2009
+
2010
+
2011
+ # ============================================================================
2012
+ # Worker Auto-Update Endpoints
2013
+ # ============================================================================
2014
+
2015
+
2016
+ class WorkerQueueConfigResponse(BaseModel):
2017
+ """Worker queue configuration with version tracking for auto-updates"""
2018
+ queue_id: str
2019
+ name: str
2020
+ display_name: Optional[str]
2021
+ description: Optional[str]
2022
+ status: str
2023
+ max_workers: Optional[int]
2024
+ heartbeat_interval: int
2025
+ tags: List[str]
2026
+ settings: dict
2027
+ config_version: str # SHA256 hash of configuration for change detection
2028
+ config_updated_at: str # Timestamp of last configuration change
2029
+ recommended_package_version: Optional[str] = None # Latest recommended worker package version
2030
+ environment_id: str
2031
+ environment_name: str
2032
+
2033
+
2034
+ class UpdateLockRequest(BaseModel):
2035
+ """Request to acquire an update lock for coordinated rolling updates"""
2036
+ worker_id: str
2037
+ lock_duration_seconds: int = Field(default=300, ge=60, le=600, description="Lock TTL (60-600 seconds)")
2038
+
2039
+
2040
+ class UpdateLockResponse(BaseModel):
2041
+ """Response with update lock information"""
2042
+ lock_id: str
2043
+ worker_id: str
2044
+ queue_id: str
2045
+ acquired_at: str
2046
+ expires_at: str
2047
+ locked: bool
2048
+
2049
+
2050
+ def _compute_config_hash(queue: dict) -> str:
2051
+ """
2052
+ Compute SHA256 hash of worker queue configuration.
2053
+
2054
+ This hash is used to detect configuration changes for auto-updates.
2055
+ Only includes fields that affect worker behavior.
2056
+ """
2057
+ config_data = {
2058
+ "name": queue.get("name"),
2059
+ "status": queue.get("status"),
2060
+ "max_workers": queue.get("max_workers"),
2061
+ "heartbeat_interval": queue.get("heartbeat_interval"),
2062
+ "tags": sorted(queue.get("tags", [])), # Sort for consistency
2063
+ "settings": queue.get("settings", {}),
2064
+ }
2065
+
2066
+ # Serialize to JSON with sorted keys for consistent hashing
2067
+ config_json = json.dumps(config_data, sort_keys=True)
2068
+ return hashlib.sha256(config_json.encode()).hexdigest()
2069
+
2070
+
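The hash is stable because the JSON is serialized with sorted keys and the tags are sorted first, so two dicts describing the same configuration always produce the same version string. A quick illustration of that property:

import hashlib
import json

def demo_hash(config: dict) -> str:
    data = {**config, "tags": sorted(config.get("tags", []))}
    return hashlib.sha256(json.dumps(data, sort_keys=True).encode()).hexdigest()

a = {"name": "default", "status": "active", "tags": ["gpu", "linux"], "settings": {}}
b = {"settings": {}, "tags": ["linux", "gpu"], "status": "active", "name": "default"}
assert demo_hash(a) == demo_hash(b)                          # ordering differences do not change the version
assert demo_hash(a) != demo_hash({**a, "status": "paused"})  # real changes do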
2071
+ @router.get("/worker-queues/{queue_id}/config", response_model=WorkerQueueConfigResponse)
2072
+ @instrument_endpoint("worker_queues.get_worker_queue_config")
2073
+ async def get_worker_queue_config(
2074
+ queue_id: str,
2075
+ request: Request,
2076
+ organization: dict = Depends(get_current_organization),
2077
+ db: Session = Depends(get_db),
2078
+ ):
2079
+ """
2080
+ Get worker queue configuration with version tracking for auto-updates.
2081
+
2082
+ This endpoint is called by CLI workers periodically to check for configuration changes.
2083
+ The config_version hash allows workers to detect when they need to reload.
2084
+
2085
+ Args:
2086
+ queue_id: Worker queue ID
2087
+
2088
+ Returns:
2089
+ Configuration with version hash and recommended package version
2090
+ """
2091
+ try:
2092
+ org_id = organization["id"]
2093
+
2094
+ # Get worker queue with environment relationship
2095
+ queue = (
2096
+ db.query(WorkerQueue)
2097
+ .options(joinedload(WorkerQueue.environment))
2098
+ .filter(WorkerQueue.id == queue_id, WorkerQueue.organization_id == org_id)
2099
+ .first()
2100
+ )
2101
+
2102
+ if not queue:
2103
+ raise HTTPException(
2104
+ status_code=status.HTTP_404_NOT_FOUND,
2105
+ detail="Worker queue not found"
2106
+ )
2107
+
2108
+ # Get environment name from relationship
2109
+ environment_name = queue.environment.name if queue.environment else "unknown"
2110
+
2111
+ # Convert queue to dict for config hash computation
2112
+ from sqlalchemy.inspection import inspect
2113
+ queue_dict = {c.key: getattr(queue, c.key) for c in inspect(queue).mapper.column_attrs}
2114
+
2115
+ # Compute configuration hash for change detection
2116
+ config_version = _compute_config_hash(queue_dict)
2117
+
2118
+ # Get recommended package version from control plane settings or PyPI
2119
+ # This can be configured via environment variable or fetched from PyPI
2120
+ recommended_package_version = os.getenv("KUBIYA_RECOMMENDED_WORKER_VERSION")
2121
+ if not recommended_package_version:
2122
+ # Fetch latest version from PyPI (cached for performance)
2123
+ try:
2124
+ import httpx
2125
+ response = httpx.get("https://pypi.org/pypi/kubiya-control-plane-api/json", timeout=5.0)
2126
+ if response.status_code == 200:
2127
+ pypi_data = response.json()
2128
+ recommended_package_version = pypi_data.get("info", {}).get("version")
2129
+ except Exception as e:
2130
+ logger.warning(
2131
+ "failed_to_fetch_pypi_version",
2132
+ error=str(e),
2133
+ queue_id=queue_id,
2134
+ )
2135
+ # Fallback: no recommendation if PyPI fetch fails
2136
+ recommended_package_version = None
2137
+
2138
+ logger.info(
2139
+ "worker_queue_config_fetched",
2140
+ queue_id=queue_id,
2141
+ config_version=config_version[:8], # Log first 8 chars of hash
2142
+ org_id=org_id,
2143
+ )
2144
+
2145
+ return WorkerQueueConfigResponse(
2146
+ queue_id=queue_id,
2147
+ name=queue.name,
2148
+ display_name=queue.display_name,
2149
+ description=queue.description,
2150
+ status=queue.status,
2151
+ max_workers=queue.max_workers,
2152
+ heartbeat_interval=queue.heartbeat_interval or 60,
2153
+ tags=queue.tags or [],
2154
+ settings=queue.settings or {},
2155
+ config_version=config_version,
2156
+ config_updated_at=queue.updated_at.isoformat() if queue.updated_at else queue.created_at.isoformat(),
2157
+ recommended_package_version=recommended_package_version,
2158
+ environment_id=str(queue.environment_id),
2159
+ environment_name=environment_name,
2160
+ )
2161
+
2162
+ except HTTPException:
2163
+ raise
2164
+ except Exception as e:
2165
+ logger.error("worker_queue_config_fetch_failed", error=str(e), queue_id=queue_id)
2166
+ raise HTTPException(
2167
+ status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
2168
+ detail=f"Failed to fetch worker queue config: {str(e)}"
2169
+ )
2170
+
2171
+
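A worker-side polling loop that uses config_version to detect changes might look like the sketch below; the base URL, auth header, and reload hook are all illustrative:

import time

import httpx

def watch_queue_config(base_url: str, queue_id: str, api_key: str, interval: int = 60) -> None:
    """Poll the config endpoint and report when config_version changes."""
    last_version = None
    headers = {"Authorization": f"Bearer {api_key}"}  # auth scheme is an assumption
    while True:
        resp = httpx.get(f"{base_url}/worker-queues/{queue_id}/config", headers=headers, timeout=10.0)
        resp.raise_for_status()
        body = resp.json()
        if last_version is not None and body["config_version"] != last_version:
            print("queue configuration changed - reload worker settings here")
        last_version = body["config_version"]
        time.sleep(interval)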
2172
+ @router.post("/worker-queues/{queue_id}/workers/{worker_id}/update-lock", response_model=UpdateLockResponse)
2173
+ @instrument_endpoint("worker_queues.acquire_update_lock")
2174
+ async def acquire_update_lock(
2175
+ queue_id: str,
2176
+ worker_id: str,
2177
+ lock_request: UpdateLockRequest,
2178
+ request: Request,
2179
+ organization: dict = Depends(get_current_organization),
2180
+ db: Session = Depends(get_db),
2181
+ ):
2182
+ """
2183
+ Acquire an update lock for coordinated rolling updates.
2184
+
2185
+ This ensures only one worker in a queue updates at a time.
2186
+ Uses Redis for distributed locking with automatic TTL expiration.
2187
+
2188
+ Args:
2189
+ queue_id: Worker queue ID
2190
+ worker_id: Worker ID requesting the lock
2191
+ lock_request: Lock configuration (duration)
2192
+
2193
+ Returns:
2194
+ Lock information if acquired, or error if another worker holds the lock
2195
+ """
2196
+ try:
2197
+ org_id = organization["id"]
2198
+ redis_client = get_redis_client()
2199
+
2200
+ if not redis_client:
2201
+ raise HTTPException(
2202
+ status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
2203
+ detail="Update coordination unavailable (Redis not configured)"
2204
+ )
2205
+
2206
+ # Verify queue exists and worker belongs to this queue
2207
+ queue = (
2208
+ db.query(WorkerQueue)
2209
+ .filter(WorkerQueue.id == queue_id, WorkerQueue.organization_id == org_id)
2210
+ .first()
2211
+ )
2212
+
2213
+ if not queue:
2214
+ raise HTTPException(
2215
+ status_code=status.HTTP_404_NOT_FOUND,
2216
+ detail="Worker queue not found"
2217
+ )
2218
+
2219
+ # Check if worker exists (optional - for validation)
2220
+ worker_heartbeat_key = f"worker:{worker_id}:heartbeat"
2221
+ worker_data = await redis_client.get(worker_heartbeat_key)
2222
+
2223
+ if not worker_data:
2224
+ logger.warning(
2225
+ "worker_not_found_in_heartbeats",
2226
+ worker_id=worker_id,
2227
+ queue_id=queue_id,
2228
+ org_id=org_id,
2229
+ )
2230
+
2231
+ # Try to acquire lock using Redis SET NX (set if not exists)
2232
+ lock_key = f"worker_queue:{queue_id}:update_lock"
2233
+ lock_id = str(uuid.uuid4())
2234
+ now = datetime.now(timezone.utc)
2235
+ expires_at = now + timedelta(seconds=lock_request.lock_duration_seconds)
2236
+
2237
+ lock_data = {
2238
+ "lock_id": lock_id,
2239
+ "worker_id": worker_id,
2240
+ "queue_id": queue_id,
2241
+ "organization_id": org_id,
2242
+ "acquired_at": now.isoformat(),
2243
+ "expires_at": expires_at.isoformat(),
2244
+ }
2245
+
2246
+ # SET NX EX: Set if not exists with expiration
2247
+ acquired = await redis_client.set(
2248
+ lock_key,
2249
+ json.dumps(lock_data),
2250
+ ex=lock_request.lock_duration_seconds,
2251
+ nx=True, # Only set if key doesn't exist
2252
+ )
2253
+
2254
+ if not acquired:
2255
+ # Lock already held by another worker
2256
+ existing_lock_data = await redis_client.get(lock_key)
2257
+ if existing_lock_data:
2258
+ existing_lock = json.loads(existing_lock_data)
2259
+ logger.info(
2260
+ "update_lock_already_held",
2261
+ queue_id=queue_id,
2262
+ requesting_worker=worker_id,
2263
+ lock_holder=existing_lock.get("worker_id"),
2264
+ org_id=org_id,
2265
+ )
2266
+ raise HTTPException(
2267
+ status_code=status.HTTP_409_CONFLICT,
2268
+ detail=f"Update lock already held by worker {existing_lock.get('worker_id')}"
2269
+ )
2270
+ else:
2271
+ # Race condition: lock was released between check and get
2272
+ logger.warning("update_lock_race_condition", queue_id=queue_id, worker_id=worker_id)
2273
+ raise HTTPException(
2274
+ status_code=status.HTTP_409_CONFLICT,
2275
+ detail="Failed to acquire lock due to race condition, please retry"
2276
+ )
2277
+
2278
+ logger.info(
2279
+ "update_lock_acquired",
2280
+ lock_id=lock_id,
2281
+ worker_id=worker_id,
2282
+ queue_id=queue_id,
2283
+ duration_seconds=lock_request.lock_duration_seconds,
2284
+ org_id=org_id,
2285
+ )
2286
+
2287
+ return UpdateLockResponse(
2288
+ lock_id=lock_id,
2289
+ worker_id=worker_id,
2290
+ queue_id=queue_id,
2291
+ acquired_at=now.isoformat(),
2292
+ expires_at=expires_at.isoformat(),
2293
+ locked=True,
2294
+ )
2295
+
2296
+ except HTTPException:
2297
+ raise
2298
+ except Exception as e:
2299
+ logger.error(
2300
+ "update_lock_acquisition_failed",
2301
+ error=str(e),
2302
+ queue_id=queue_id,
2303
+ worker_id=worker_id,
2304
+ )
2305
+ raise HTTPException(
2306
+ status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
2307
+ detail=f"Failed to acquire update lock: {str(e)}"
2308
+ )
2309
+
2310
+
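The lock above is the classic Redis SET NX EX pattern: the key is only written if it does not already exist, and it expires on its own if the holder dies before releasing it. A self-contained sketch with redis-py's asyncio client (the Redis URL is a placeholder; the key format mirrors the one used above):

import asyncio
import json
import uuid
from datetime import datetime, timezone

import redis.asyncio as redis  # redis-py >= 4.2

async def try_acquire_update_lock(queue_id: str, worker_id: str, ttl_seconds: int = 300) -> bool:
    client = redis.from_url("redis://localhost:6379/0")  # placeholder URL
    lock_key = f"worker_queue:{queue_id}:update_lock"
    payload = json.dumps({
        "lock_id": str(uuid.uuid4()),
        "worker_id": worker_id,
        "acquired_at": datetime.now(timezone.utc).isoformat(),
    })
    # nx=True -> only set if the key does not exist; ex=ttl -> expire automatically if never released
    acquired = await client.set(lock_key, payload, ex=ttl_seconds, nx=True)
    await client.close()
    return bool(acquired)

# asyncio.run(try_acquire_update_lock("<queue-uuid>", "<worker-uuid>"))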
2311
+ @router.delete("/worker-queues/{queue_id}/workers/{worker_id}/update-lock", status_code=status.HTTP_204_NO_CONTENT)
2312
+ @instrument_endpoint("worker_queues.release_update_lock")
2313
+ async def release_update_lock(
2314
+ queue_id: str,
2315
+ worker_id: str,
2316
+ request: Request,
2317
+ organization: dict = Depends(get_current_organization),
2318
+ ):
2319
+ """
2320
+ Release an update lock after worker has completed its update.
2321
+
2322
+ Only the worker that acquired the lock can release it (verified by worker_id).
2323
+
2324
+ Args:
2325
+ queue_id: Worker queue ID
2326
+ worker_id: Worker ID that holds the lock
2327
+ """
2328
+ try:
2329
+ org_id = organization["id"]
2330
+ redis_client = get_redis_client()
2331
+
2332
+ if not redis_client:
2333
+ # If Redis is unavailable, just return success (lock will expire naturally)
2334
+ logger.warning(
2335
+ "redis_unavailable_for_lock_release",
2336
+ queue_id=queue_id,
2337
+ worker_id=worker_id,
2338
+ org_id=org_id,
2339
+ )
2340
+ return None
2341
+
2342
+ lock_key = f"worker_queue:{queue_id}:update_lock"
2343
+
2344
+ # Get current lock to verify ownership
2345
+ lock_data_str = await redis_client.get(lock_key)
2346
+
2347
+ if not lock_data_str:
2348
+ # Lock doesn't exist (already expired or never acquired)
2349
+ logger.info(
2350
+ "update_lock_not_found",
2351
+ queue_id=queue_id,
2352
+ worker_id=worker_id,
2353
+ org_id=org_id,
2354
+ )
2355
+ return None
2356
+
2357
+ lock_data = json.loads(lock_data_str)
2358
+
2359
+ # Verify lock is held by this worker
2360
+ if lock_data.get("worker_id") != worker_id:
2361
+ logger.warning(
2362
+ "update_lock_ownership_mismatch",
2363
+ queue_id=queue_id,
2364
+ requesting_worker=worker_id,
2365
+ lock_holder=lock_data.get("worker_id"),
2366
+ org_id=org_id,
2367
+ )
2368
+ raise HTTPException(
2369
+ status_code=status.HTTP_403_FORBIDDEN,
2370
+ detail=f"Lock is held by another worker ({lock_data.get('worker_id')})"
2371
+ )
2372
+
2373
+ # Release the lock
2374
+ await redis_client.delete(lock_key)
2375
+
2376
+ logger.info(
2377
+ "update_lock_released",
2378
+ lock_id=lock_data.get("lock_id"),
2379
+ worker_id=worker_id,
2380
+ queue_id=queue_id,
2381
+ org_id=org_id,
2382
+ )
2383
+
2384
+ return None
2385
+
2386
+ except HTTPException:
2387
+ raise
2388
+ except Exception as e:
2389
+ logger.error(
2390
+ "update_lock_release_failed",
2391
+ error=str(e),
2392
+ queue_id=queue_id,
2393
+ worker_id=worker_id,
2394
+ )
2395
+ raise HTTPException(
2396
+ status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
2397
+ detail=f"Failed to release update lock: {str(e)}"
2398
+ )
2399
+
2400
+
+@router.get("/worker-queues/{queue_id}/update-lock-status")
+@instrument_endpoint("worker_queues.get_update_lock_status")
+async def get_update_lock_status(
+    queue_id: str,
+    request: Request,
+    organization: dict = Depends(get_current_organization),
+):
+    """
+    Get the current update lock status for a queue.
+
+    Useful for checking if updates are in progress before triggering manual updates.
+
+    Args:
+        queue_id: Worker queue ID
+
+    Returns:
+        Lock status (locked/unlocked) and lock holder if locked
+    """
+    try:
+        org_id = organization["id"]
+        redis_client = get_redis_client()
+
+        if not redis_client:
+            return {
+                "locked": False,
+                "lock_coordination_available": False,
+                "message": "Lock coordination unavailable (Redis not configured)",
+            }
+
+        lock_key = f"worker_queue:{queue_id}:update_lock"
+        lock_data_str = await redis_client.get(lock_key)
+
+        if not lock_data_str:
+            return {
+                "locked": False,
+                "queue_id": queue_id,
+                "lock_coordination_available": True,
+            }
+
+        lock_data = json.loads(lock_data_str)
+
+        # Get TTL for expiration info
+        ttl = await redis_client.ttl(lock_key)
+
+        return {
+            "locked": True,
+            "queue_id": queue_id,
+            "worker_id": lock_data.get("worker_id"),
+            "lock_id": lock_data.get("lock_id"),
+            "acquired_at": lock_data.get("acquired_at"),
+            "expires_at": lock_data.get("expires_at"),
+            "ttl_seconds": ttl if ttl > 0 else 0,
+            "lock_coordination_available": True,
+        }
+
+    except Exception as e:
+        logger.error(
+            "update_lock_status_check_failed",
+            error=str(e),
+            queue_id=queue_id,
+        )
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Failed to check lock status: {str(e)}"
+        )
+
+
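The status endpoint above is intended as a pre-flight check before triggering a manual update. Below is a small, hypothetical polling helper showing one way to wait for the lock to clear; again, the base URL, auth header, and httpx usage are assumptions, while the response fields (locked, ttl_seconds, lock_coordination_available) are taken from the handler.

    import time
    import httpx

    CONTROL_PLANE_URL = "https://control-plane.example.com"  # placeholder base URL (assumption)
    AUTH_HEADERS = {"Authorization": "Bearer <token>"}       # placeholder auth header (assumption)

    def wait_until_unlocked(queue_id: str, poll_seconds: float = 5.0, max_wait: float = 300.0) -> bool:
        """Poll the update-lock-status endpoint until the queue is unlocked or max_wait elapses."""
        deadline = time.monotonic() + max_wait
        while time.monotonic() < deadline:
            resp = httpx.get(
                f"{CONTROL_PLANE_URL}/worker-queues/{queue_id}/update-lock-status",
                headers=AUTH_HEADERS,
                timeout=10.0,
            )
            resp.raise_for_status()
            lock = resp.json()
            if not lock["locked"]:
                # Either no lock is held, or Redis is unavailable
                # (lock_coordination_available == False) and coordination is best-effort.
                return True
            # Another worker is mid-update; ttl_seconds hints at the remaining lock lifetime.
            time.sleep(min(poll_seconds, lock.get("ttl_seconds") or poll_seconds))
        return False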
+@router.get("/worker-queues/{queue_id}/executions")
+@instrument_endpoint("worker_queues.list_queue_executions")
+async def list_queue_executions(
+    queue_id: str,
+    request: Request,
+    limit: int = 10,
+    status: str = "all",
+    organization: dict = Depends(get_current_organization),
+    db: Session = Depends(get_db),
+):
+    """
+    List recent executions for a specific worker queue.
+
+    Used by workers in single-execution mode to monitor when their task completes.
+
+    Args:
+        queue_id: Worker queue ID
+        limit: Maximum number of executions to return (default: 10)
+        status: Filter by status ('all', 'running', 'completed', 'failed', etc.)
+
+    Returns:
+        List of executions for this queue
+    """
+    # The `status` query parameter shadows the fastapi `status` module inside this
+    # function, so bind the module under a different name for HTTP status codes.
+    from fastapi import status as http_status
+
+    try:
+        org_id = organization["id"]
+
+        # Verify queue exists and belongs to this org
+        queue = (
+            db.query(WorkerQueue)
+            .filter(WorkerQueue.id == queue_id, WorkerQueue.organization_id == org_id)
+            .first()
+        )
+
+        if not queue:
+            raise HTTPException(
+                status_code=http_status.HTTP_404_NOT_FOUND,
+                detail="Worker queue not found"
+            )
+
+        # Import Execution model
+        from control_plane_api.app.models.execution import Execution
+
+        # Query executions for this queue
+        query = db.query(Execution).filter(
+            Execution.organization_id == org_id,
+            Execution.worker_queue_id == queue_id
+        )
+
+        # Filter by status if not 'all'
+        if status != "all":
+            query = query.filter(Execution.status == status)
+
+        # Order by created_at descending and limit
+        executions = query.order_by(desc(Execution.created_at)).limit(limit).all()
+
+        # Convert to dict for JSON response
+        result = []
+        for execution in executions:
+            result.append({
+                "id": str(execution.id),
+                "status": execution.status,
+                "entity_id": str(execution.entity_id),
+                "entity_name": execution.entity_name,
+                "execution_type": execution.execution_type,
+                "prompt": execution.prompt[:200] if execution.prompt else None,  # Truncate for brevity
+                "created_at": execution.created_at.isoformat() if execution.created_at else None,
+                "started_at": execution.started_at.isoformat() if execution.started_at else None,
+                "completed_at": execution.completed_at.isoformat() if execution.completed_at else None,
+                "temporal_workflow_id": execution.temporal_workflow_id,
+            })
+
+        logger.info(
+            "queue_executions_listed",
+            queue_id=queue_id,
+            count=len(result),
+            org_id=org_id,
+        )
+
+        return result
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error("queue_executions_list_failed", error=str(e), queue_id=queue_id)
+        raise HTTPException(
+            status_code=http_status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Failed to list queue executions: {str(e)}"
+        )
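Per its docstring, the executions listing above exists so that a worker running in single-execution mode can tell when work on its queue has finished. The sketch below is a hypothetical polling loop built on that contract; the base URL, auth header, and httpx client are assumptions, while the status query parameter and the list-shaped response come from the handler.

    import time
    import httpx

    CONTROL_PLANE_URL = "https://control-plane.example.com"  # placeholder base URL (assumption)
    AUTH_HEADERS = {"Authorization": "Bearer <token>"}       # placeholder auth header (assumption)

    def wait_for_queue_to_drain(queue_id: str, poll_seconds: float = 10.0) -> None:
        """Block until the queue reports no running executions (e.g. before a single-execution worker exits)."""
        while True:
            resp = httpx.get(
                f"{CONTROL_PLANE_URL}/worker-queues/{queue_id}/executions",
                params={"status": "running", "limit": 1},
                headers=AUTH_HEADERS,
                timeout=10.0,
            )
            resp.raise_for_status()
            if not resp.json():  # empty list: nothing is currently running on this queue
                return
            time.sleep(poll_seconds)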