kubiya-control-plane-api 0.1.0-py3-none-any.whl → 0.3.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (185)
  1. control_plane_api/README.md +266 -0
  2. control_plane_api/__init__.py +0 -0
  3. control_plane_api/__version__.py +1 -0
  4. control_plane_api/alembic/README +1 -0
  5. control_plane_api/alembic/env.py +98 -0
  6. control_plane_api/alembic/script.py.mako +28 -0
  7. control_plane_api/alembic/versions/1382bec74309_initial_migration_with_all_models.py +251 -0
  8. control_plane_api/alembic/versions/1f54bc2a37e3_add_analytics_tables.py +162 -0
  9. control_plane_api/alembic/versions/2e4cb136dc10_rename_toolset_ids_to_skill_ids_in_teams.py +30 -0
  10. control_plane_api/alembic/versions/31cd69a644ce_add_skill_templates_table.py +28 -0
  11. control_plane_api/alembic/versions/89e127caa47d_add_jobs_and_job_executions_tables.py +161 -0
  12. control_plane_api/alembic/versions/add_llm_models_table.py +51 -0
  13. control_plane_api/alembic/versions/b0e10697f212_add_runtime_column_to_teams_simple.py +42 -0
  14. control_plane_api/alembic/versions/ce43b24b63bf_add_execution_trigger_source_and_fix_.py +155 -0
  15. control_plane_api/alembic/versions/d4eaf16e3f8d_rename_toolsets_to_skills.py +84 -0
  16. control_plane_api/alembic/versions/efa2dc427da1_rename_metadata_to_custom_metadata.py +32 -0
  17. control_plane_api/alembic/versions/f973b431d1ce_add_workflow_executor_to_skill_types.py +44 -0
  18. control_plane_api/alembic.ini +148 -0
  19. control_plane_api/api/index.py +12 -0
  20. control_plane_api/app/__init__.py +11 -0
  21. control_plane_api/app/activities/__init__.py +20 -0
  22. control_plane_api/app/activities/agent_activities.py +379 -0
  23. control_plane_api/app/activities/team_activities.py +410 -0
  24. control_plane_api/app/activities/temporal_cloud_activities.py +577 -0
  25. control_plane_api/app/config/__init__.py +35 -0
  26. control_plane_api/app/config/api_config.py +354 -0
  27. control_plane_api/app/config/model_pricing.py +318 -0
  28. control_plane_api/app/config.py +95 -0
  29. control_plane_api/app/database.py +135 -0
  30. control_plane_api/app/exceptions.py +408 -0
  31. control_plane_api/app/lib/__init__.py +11 -0
  32. control_plane_api/app/lib/job_executor.py +312 -0
  33. control_plane_api/app/lib/kubiya_client.py +235 -0
  34. control_plane_api/app/lib/litellm_pricing.py +166 -0
  35. control_plane_api/app/lib/planning_tools/__init__.py +22 -0
  36. control_plane_api/app/lib/planning_tools/agents.py +155 -0
  37. control_plane_api/app/lib/planning_tools/base.py +189 -0
  38. control_plane_api/app/lib/planning_tools/environments.py +214 -0
  39. control_plane_api/app/lib/planning_tools/resources.py +240 -0
  40. control_plane_api/app/lib/planning_tools/teams.py +198 -0
  41. control_plane_api/app/lib/policy_enforcer_client.py +939 -0
  42. control_plane_api/app/lib/redis_client.py +436 -0
  43. control_plane_api/app/lib/supabase.py +71 -0
  44. control_plane_api/app/lib/temporal_client.py +138 -0
  45. control_plane_api/app/lib/validation/__init__.py +20 -0
  46. control_plane_api/app/lib/validation/runtime_validation.py +287 -0
  47. control_plane_api/app/main.py +128 -0
  48. control_plane_api/app/middleware/__init__.py +8 -0
  49. control_plane_api/app/middleware/auth.py +513 -0
  50. control_plane_api/app/middleware/exception_handler.py +267 -0
  51. control_plane_api/app/middleware/rate_limiting.py +384 -0
  52. control_plane_api/app/middleware/request_id.py +202 -0
  53. control_plane_api/app/models/__init__.py +27 -0
  54. control_plane_api/app/models/agent.py +79 -0
  55. control_plane_api/app/models/analytics.py +206 -0
  56. control_plane_api/app/models/associations.py +81 -0
  57. control_plane_api/app/models/environment.py +63 -0
  58. control_plane_api/app/models/execution.py +93 -0
  59. control_plane_api/app/models/job.py +179 -0
  60. control_plane_api/app/models/llm_model.py +75 -0
  61. control_plane_api/app/models/presence.py +49 -0
  62. control_plane_api/app/models/project.py +47 -0
  63. control_plane_api/app/models/session.py +38 -0
  64. control_plane_api/app/models/team.py +66 -0
  65. control_plane_api/app/models/workflow.py +55 -0
  66. control_plane_api/app/policies/README.md +121 -0
  67. control_plane_api/app/policies/approved_users.rego +62 -0
  68. control_plane_api/app/policies/business_hours.rego +51 -0
  69. control_plane_api/app/policies/rate_limiting.rego +100 -0
  70. control_plane_api/app/policies/tool_restrictions.rego +86 -0
  71. control_plane_api/app/routers/__init__.py +4 -0
  72. control_plane_api/app/routers/agents.py +364 -0
  73. control_plane_api/app/routers/agents_v2.py +1260 -0
  74. control_plane_api/app/routers/analytics.py +1014 -0
  75. control_plane_api/app/routers/context_manager.py +562 -0
  76. control_plane_api/app/routers/environment_context.py +270 -0
  77. control_plane_api/app/routers/environments.py +715 -0
  78. control_plane_api/app/routers/execution_environment.py +517 -0
  79. control_plane_api/app/routers/executions.py +1911 -0
  80. control_plane_api/app/routers/health.py +92 -0
  81. control_plane_api/app/routers/health_v2.py +326 -0
  82. control_plane_api/app/routers/integrations.py +274 -0
  83. control_plane_api/app/routers/jobs.py +1344 -0
  84. control_plane_api/app/routers/models.py +82 -0
  85. control_plane_api/app/routers/models_v2.py +361 -0
  86. control_plane_api/app/routers/policies.py +639 -0
  87. control_plane_api/app/routers/presence.py +234 -0
  88. control_plane_api/app/routers/projects.py +902 -0
  89. control_plane_api/app/routers/runners.py +379 -0
  90. control_plane_api/app/routers/runtimes.py +172 -0
  91. control_plane_api/app/routers/secrets.py +155 -0
  92. control_plane_api/app/routers/skills.py +1001 -0
  93. control_plane_api/app/routers/skills_definitions.py +140 -0
  94. control_plane_api/app/routers/task_planning.py +1256 -0
  95. control_plane_api/app/routers/task_queues.py +654 -0
  96. control_plane_api/app/routers/team_context.py +270 -0
  97. control_plane_api/app/routers/teams.py +1400 -0
  98. control_plane_api/app/routers/worker_queues.py +1545 -0
  99. control_plane_api/app/routers/workers.py +935 -0
  100. control_plane_api/app/routers/workflows.py +204 -0
  101. control_plane_api/app/runtimes/__init__.py +6 -0
  102. control_plane_api/app/runtimes/validation.py +344 -0
  103. control_plane_api/app/schemas/job_schemas.py +295 -0
  104. control_plane_api/app/services/__init__.py +1 -0
  105. control_plane_api/app/services/agno_service.py +619 -0
  106. control_plane_api/app/services/litellm_service.py +190 -0
  107. control_plane_api/app/services/policy_service.py +525 -0
  108. control_plane_api/app/services/temporal_cloud_provisioning.py +150 -0
  109. control_plane_api/app/skills/__init__.py +44 -0
  110. control_plane_api/app/skills/base.py +229 -0
  111. control_plane_api/app/skills/business_intelligence.py +189 -0
  112. control_plane_api/app/skills/data_visualization.py +154 -0
  113. control_plane_api/app/skills/docker.py +104 -0
  114. control_plane_api/app/skills/file_generation.py +94 -0
  115. control_plane_api/app/skills/file_system.py +110 -0
  116. control_plane_api/app/skills/python.py +92 -0
  117. control_plane_api/app/skills/registry.py +65 -0
  118. control_plane_api/app/skills/shell.py +102 -0
  119. control_plane_api/app/skills/workflow_executor.py +469 -0
  120. control_plane_api/app/utils/workflow_executor.py +354 -0
  121. control_plane_api/app/workflows/__init__.py +11 -0
  122. control_plane_api/app/workflows/agent_execution.py +507 -0
  123. control_plane_api/app/workflows/agent_execution_with_skills.py +222 -0
  124. control_plane_api/app/workflows/namespace_provisioning.py +326 -0
  125. control_plane_api/app/workflows/team_execution.py +399 -0
  126. control_plane_api/scripts/seed_models.py +239 -0
  127. control_plane_api/worker/__init__.py +0 -0
  128. control_plane_api/worker/activities/__init__.py +0 -0
  129. control_plane_api/worker/activities/agent_activities.py +1241 -0
  130. control_plane_api/worker/activities/approval_activities.py +234 -0
  131. control_plane_api/worker/activities/runtime_activities.py +388 -0
  132. control_plane_api/worker/activities/skill_activities.py +267 -0
  133. control_plane_api/worker/activities/team_activities.py +1217 -0
  134. control_plane_api/worker/config/__init__.py +31 -0
  135. control_plane_api/worker/config/worker_config.py +275 -0
  136. control_plane_api/worker/control_plane_client.py +529 -0
  137. control_plane_api/worker/examples/analytics_integration_example.py +362 -0
  138. control_plane_api/worker/models/__init__.py +1 -0
  139. control_plane_api/worker/models/inputs.py +89 -0
  140. control_plane_api/worker/runtimes/__init__.py +31 -0
  141. control_plane_api/worker/runtimes/base.py +789 -0
  142. control_plane_api/worker/runtimes/claude_code_runtime.py +1443 -0
  143. control_plane_api/worker/runtimes/default_runtime.py +617 -0
  144. control_plane_api/worker/runtimes/factory.py +173 -0
  145. control_plane_api/worker/runtimes/validation.py +93 -0
  146. control_plane_api/worker/services/__init__.py +1 -0
  147. control_plane_api/worker/services/agent_executor.py +422 -0
  148. control_plane_api/worker/services/agent_executor_v2.py +383 -0
  149. control_plane_api/worker/services/analytics_collector.py +457 -0
  150. control_plane_api/worker/services/analytics_service.py +464 -0
  151. control_plane_api/worker/services/approval_tools.py +310 -0
  152. control_plane_api/worker/services/approval_tools_agno.py +207 -0
  153. control_plane_api/worker/services/cancellation_manager.py +177 -0
  154. control_plane_api/worker/services/data_visualization.py +827 -0
  155. control_plane_api/worker/services/jira_tools.py +257 -0
  156. control_plane_api/worker/services/runtime_analytics.py +328 -0
  157. control_plane_api/worker/services/session_service.py +194 -0
  158. control_plane_api/worker/services/skill_factory.py +175 -0
  159. control_plane_api/worker/services/team_executor.py +574 -0
  160. control_plane_api/worker/services/team_executor_v2.py +465 -0
  161. control_plane_api/worker/services/workflow_executor_tools.py +1418 -0
  162. control_plane_api/worker/tests/__init__.py +1 -0
  163. control_plane_api/worker/tests/e2e/__init__.py +0 -0
  164. control_plane_api/worker/tests/e2e/test_execution_flow.py +571 -0
  165. control_plane_api/worker/tests/integration/__init__.py +0 -0
  166. control_plane_api/worker/tests/integration/test_control_plane_integration.py +308 -0
  167. control_plane_api/worker/tests/unit/__init__.py +0 -0
  168. control_plane_api/worker/tests/unit/test_control_plane_client.py +401 -0
  169. control_plane_api/worker/utils/__init__.py +1 -0
  170. control_plane_api/worker/utils/chunk_batcher.py +305 -0
  171. control_plane_api/worker/utils/retry_utils.py +60 -0
  172. control_plane_api/worker/utils/streaming_utils.py +373 -0
  173. control_plane_api/worker/worker.py +753 -0
  174. control_plane_api/worker/workflows/__init__.py +0 -0
  175. control_plane_api/worker/workflows/agent_execution.py +589 -0
  176. control_plane_api/worker/workflows/team_execution.py +429 -0
  177. kubiya_control_plane_api-0.3.4.dist-info/METADATA +229 -0
  178. kubiya_control_plane_api-0.3.4.dist-info/RECORD +182 -0
  179. kubiya_control_plane_api-0.3.4.dist-info/entry_points.txt +2 -0
  180. kubiya_control_plane_api-0.3.4.dist-info/top_level.txt +1 -0
  181. kubiya_control_plane_api-0.1.0.dist-info/METADATA +0 -66
  182. kubiya_control_plane_api-0.1.0.dist-info/RECORD +0 -5
  183. kubiya_control_plane_api-0.1.0.dist-info/top_level.txt +0 -1
  184. {kubiya_control_plane_api-0.1.0.dist-info/licenses → control_plane_api}/LICENSE +0 -0
  185. {kubiya_control_plane_api-0.1.0.dist-info → kubiya_control_plane_api-0.3.4.dist-info}/WHEEL +0 -0

control_plane_api/app/routers/workers.py (new file)
@@ -0,0 +1,935 @@
+ """Workers endpoint - shows registered Temporal workers and handles worker registration"""
+
+ from fastapi import APIRouter, Depends, HTTPException, status, Request
+ from typing import List, Dict, Any, Optional
+ from pydantic import BaseModel
+ from datetime import datetime
+ import structlog
+ import uuid
+ import json
+
+ from control_plane_api.app.middleware.auth import get_current_organization
+ from control_plane_api.app.lib.temporal_client import get_temporal_client
+ from control_plane_api.app.lib.supabase import get_supabase
+ from control_plane_api.app.lib.redis_client import get_redis_client
+
+ logger = structlog.get_logger()
+
+ router = APIRouter()
+
+
+ class WorkerInfo(BaseModel):
+     """Worker information"""
+     identity: str
+     last_access_time: str | None
+     rate_per_second: float | None
+
+
+ class TaskQueueInfo(BaseModel):
+     """Task queue with worker information"""
+     task_queue: str
+     organization_id: str
+     runner_name: str
+     workers: List[WorkerInfo]
+     worker_count: int
+     approximate_backlog_count: int | None
+
+
+ @router.get("", response_model=List[TaskQueueInfo])
+ async def list_workers(
+     request: Request,
+     organization: dict = Depends(get_current_organization),
+ ):
+     """
+     List registered Temporal workers for the organization.
+
+     This queries Temporal to get all task queues for the organization
+     and returns information about registered workers on each queue.
+
+     Task queue naming convention: {organization_id}.{runner_name}
+     """
+     try:
+         temporal_client = await get_temporal_client()
+         org_id = organization["id"]
+
+         # Get runners from Kubiya API to know which task queues to check
+         from control_plane_api.app.lib.kubiya_client import get_kubiya_client
+         kubiya_client = get_kubiya_client()
+         token = request.state.kubiya_token
+
+         try:
+             runners = await kubiya_client.get_runners(token, org_id)
+         except Exception as e:
+             logger.warning(
+                 "failed_to_fetch_kubiya_runners",
+                 error=str(e),
+                 org_id=org_id
+             )
+             # If we can't get runners from Kubiya, fall back to checking common ones
+             runners = [{"name": "default"}]
+
+         environments_info = []
+
+         for runner in runners:
+             # Runner might be a dict or a string
+             if isinstance(runner, dict):
+                 runner_name = runner.get("name", "default")
+             else:
+                 runner_name = str(runner) if runner else "default"
+
+             task_queue = f"{org_id}.{runner_name}"
+
+             try:
+                 # Describe the task queue to get worker information
+                 desc = await temporal_client.describe_task_queue(
+                     task_queue=task_queue,
+                     task_queue_type=1,  # TaskQueueType.WORKFLOW
+                 )
+
+                 workers = []
+                 approximate_backlog = None
+
+                 # Extract worker information from pollers
+                 if desc.pollers:
+                     for poller in desc.pollers:
+                         worker_info = WorkerInfo(
+                             identity=poller.identity,
+                             last_access_time=poller.last_access_time.isoformat() if poller.last_access_time else None,
+                             rate_per_second=poller.rate_per_second if hasattr(poller, 'rate_per_second') else None,
+                         )
+                         workers.append(worker_info)
+
+                 # Get approximate backlog count if available
+                 if hasattr(desc, 'approximate_backlog_count'):
+                     approximate_backlog = desc.approximate_backlog_count
+
+                 task_queue_info = TaskQueueInfo(
+                     task_queue=task_queue,
+                     organization_id=org_id,
+                     runner_name=runner_name,
+                     workers=workers,
+                     worker_count=len(workers),
+                     approximate_backlog_count=approximate_backlog,
+                 )
+
+                 environments_info.append(task_queue_info)
+
+                 logger.info(
+                     "task_queue_described",
+                     task_queue=task_queue,
+                     worker_count=len(workers),
+                     org_id=org_id,
+                 )
+
+             except Exception as e:
+                 # Task queue might not exist yet if no worker has registered
+                 logger.debug(
+                     "task_queue_not_found",
+                     task_queue=task_queue,
+                     error=str(e),
+                     org_id=org_id,
+                 )
+                 # Add empty task queue info
+                 task_queue_info = TaskQueueInfo(
+                     task_queue=task_queue,
+                     organization_id=org_id,
+                     runner_name=runner_name,
+                     workers=[],
+                     worker_count=0,
+                     approximate_backlog_count=None,
+                 )
+                 environments_info.append(task_queue_info)
+
+         logger.info(
+             "workers_listed",
+             org_id=org_id,
+             task_queue_count=len(environments_info),
+             total_workers=sum(tq.worker_count for tq in environments_info),
+         )
+
+         return environments_info
+
+     except Exception as e:
+         logger.error(
+             "workers_list_failed",
+             error=str(e),
+             org_id=organization["id"]
+         )
+         raise HTTPException(
+             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+             detail=f"Failed to list workers: {str(e)}"
+         )
+
+
+ @router.get("/{runner_name}", response_model=TaskQueueInfo)
+ async def get_workers_for_runner(
+     runner_name: str,
+     request: Request,
+     organization: dict = Depends(get_current_organization),
+ ):
+     """
+     Get worker information for a specific runner.
+
+     Args:
+         runner_name: The runner name (e.g., "default", "production-runner")
+     """
+     try:
+         temporal_client = await get_temporal_client()
+         org_id = organization["id"]
+         task_queue = f"{org_id}.{runner_name}"
+
+         try:
+             # Describe the task queue
+             desc = await temporal_client.describe_task_queue(
+                 task_queue=task_queue,
+                 task_queue_type=1,  # TaskQueueType.WORKFLOW
+             )
+
+             workers = []
+             approximate_backlog = None
+
+             # Extract worker information
+             if desc.pollers:
+                 for poller in desc.pollers:
+                     worker_info = WorkerInfo(
+                         identity=poller.identity,
+                         last_access_time=poller.last_access_time.isoformat() if poller.last_access_time else None,
+                         rate_per_second=poller.rate_per_second if hasattr(poller, 'rate_per_second') else None,
+                     )
+                     workers.append(worker_info)
+
+             if hasattr(desc, 'approximate_backlog_count'):
+                 approximate_backlog = desc.approximate_backlog_count
+
+             task_queue_info = TaskQueueInfo(
+                 task_queue=task_queue,
+                 organization_id=org_id,
+                 runner_name=runner_name,
+                 workers=workers,
+                 worker_count=len(workers),
+                 approximate_backlog_count=approximate_backlog,
+             )
+
+             logger.info(
+                 "workers_fetched_for_runner",
+                 runner_name=runner_name,
+                 worker_count=len(workers),
+                 org_id=org_id,
+             )
+
+             return task_queue_info
+
+         except Exception as e:
+             logger.warning(
+                 "task_queue_not_found",
+                 task_queue=task_queue,
+                 error=str(e),
+                 org_id=org_id,
+             )
+             # Return empty worker info if task queue doesn't exist
+             return TaskQueueInfo(
+                 task_queue=task_queue,
+                 organization_id=org_id,
+                 runner_name=runner_name,
+                 workers=[],
+                 worker_count=0,
+                 approximate_backlog_count=None,
+             )
+
+     except Exception as e:
+         logger.error(
+             "workers_fetch_failed",
+             error=str(e),
+             runner_name=runner_name,
+             org_id=organization["id"]
+         )
+         raise HTTPException(
+             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+             detail=f"Failed to fetch workers: {str(e)}"
+         )
+
+
+ # Worker Registration for Decoupled Architecture
+
+
+ class WorkerRegistrationRequest(BaseModel):
+     """Worker registration request"""
+     environment_name: str  # Task queue / environment name worker wants to join
+     hostname: Optional[str] = None
+     worker_metadata: Dict[str, Any] = {}
+
+
+ class WorkerRegistrationResponse(BaseModel):
+     """Worker registration response with all config needed"""
+     worker_id: str  # Unique worker ID
+     worker_token: str  # Token for this worker (from environment)
+     environment_name: str  # Task queue name (format: org_id.environment)
+     temporal_namespace: str
+     temporal_host: str
+     temporal_api_key: str
+     organization_id: str
+     control_plane_url: str
+
+
+ class WorkerHeartbeatRequest(BaseModel):
+     """Worker heartbeat request"""
+     worker_id: str
+     environment_name: str
+     status: str = "active"  # active, idle, busy
+     tasks_processed: int = 0
+     current_task_id: Optional[str] = None
+     worker_metadata: Dict[str, Any] = {}
+
+
+ @router.post("/register", response_model=WorkerRegistrationResponse)
+ async def register_worker(
+     registration: WorkerRegistrationRequest,
+     request: Request,
+     organization: dict = Depends(get_current_organization),
+ ):
+     """
+     Register a new worker with the control plane.
+
+     This endpoint is called by workers on startup to get their configuration.
+     The worker authenticates using KUBIYA_API_KEY (same auth as other API calls).
+
+     Returns:
+         All configuration needed for worker to connect to Temporal and operate:
+         - worker_id: Unique ID for this worker instance
+         - worker_token: Environment's worker token
+         - environment_name: Formatted task queue name (org_id.environment)
+         - temporal_namespace, temporal_host, temporal_api_key: Temporal Cloud config
+         - organization_id: Organization ID
+         - control_plane_url: URL to send heartbeats
+     """
+     try:
+         client = get_supabase()
+         org_id = organization["id"]
+
+         # Look up the environment by name
+         env_result = (
+             client.table("environments")
+             .select("*")
+             .eq("organization_id", org_id)
+             .eq("name", registration.environment_name)
+             .execute()
+         )
+
+         # If environment doesn't exist, create it
+         if not env_result.data or len(env_result.data) == 0:
+             logger.info(
+                 "creating_environment_for_worker",
+                 environment_name=registration.environment_name,
+                 org_id=org_id,
+             )
+
+             # Generate worker token for this environment (UUID format)
+             worker_token = str(uuid.uuid4())
+
+             # Create the environment
+             new_env = {
+                 "id": str(uuid.uuid4()),
+                 "organization_id": org_id,
+                 "name": registration.environment_name,
+                 "worker_token": worker_token,
+                 "status": "active",  # Mark as active immediately
+                 "created_at": datetime.utcnow().isoformat(),
+                 "updated_at": datetime.utcnow().isoformat(),
+             }
+
+             env_create_result = (
+                 client.table("environments")
+                 .insert(new_env)
+                 .execute()
+             )
+
+             environment = env_create_result.data[0]
+
+             logger.info(
+                 "environment_created_for_worker",
+                 environment_name=registration.environment_name,
+                 environment_id=environment["id"],
+                 org_id=org_id,
+             )
+         else:
+             environment = env_result.data[0]
+
+             # Check if environment is ready
+             if environment.get("status") not in ["ready", "active"]:
+                 raise HTTPException(
+                     status_code=status.HTTP_400_BAD_REQUEST,
+                     detail=f"Environment is not ready (status: {environment.get('status')}). "
+                            f"Please wait for provisioning to complete."
+                 )
+
+         # TEMPORARY: Skip provisioning and use fixed namespace + admin API key
+         # Get temporal namespace for this organization
+         import os
+
+         # Use fixed namespace for testing
+         namespace = {
+             "namespace_name": "agent-control-plane.lpagu",
+             "api_key_encrypted": os.getenv("TEMPORAL_CLOUD_ADMIN_TOKEN", ""),
+             "status": "ready"
+         }
+
+         logger.info(
+             "using_fixed_namespace_for_testing",
+             namespace_name=namespace["namespace_name"],
+             org_id=org_id,
+         )
+
+         # Generate worker ID
+         worker_id = str(uuid.uuid4())
+
+         # Create worker record in database
+         worker_record = {
+             "id": worker_id,  # Use id as primary key
+             "worker_id": worker_id,  # Also set worker_id (has NOT NULL constraint)
+             "organization_id": org_id,
+             "environment_name": registration.environment_name,
+             "worker_token": environment.get("worker_token"),
+             "hostname": registration.hostname,
+             "worker_metadata": registration.worker_metadata,
+             "status": "active",
+             "tasks_processed": 0,
+             "registered_at": datetime.utcnow().isoformat(),
+             "last_heartbeat": datetime.utcnow().isoformat(),
+             "updated_at": datetime.utcnow().isoformat(),
+         }
+
+         client.table("worker_heartbeats").insert(worker_record).execute()
+
+         # Format task queue name: org_id.environment_name
+         task_queue_name = f"{org_id}.{registration.environment_name}"
+
+         # Get Temporal Cloud configuration
+         import os
+         temporal_host = os.getenv("TEMPORAL_HOST", "us-east-1.aws.api.temporal.io:7233")
+
+         # Decrypt API key from namespace (TODO: implement proper decryption)
+         temporal_api_key = namespace.get("api_key_encrypted", "")
+
+         # Get control plane URL from environment or construct from request
+         control_plane_url = os.getenv("CONTROL_PLANE_URL")
+         if not control_plane_url:
+             # Construct from request if not set
+             control_plane_url = f"{request.url.scheme}://{request.url.netloc}"
+
+         logger.info(
+             "worker_registered",
+             worker_id=worker_id,
+             environment_name=registration.environment_name,
+             task_queue=task_queue_name,
+             org_id=org_id,
+         )
+
+         return WorkerRegistrationResponse(
+             worker_id=worker_id,
+             worker_token=environment.get("worker_token"),
+             environment_name=task_queue_name,  # Return formatted name
+             temporal_namespace=namespace.get("namespace_name"),
+             temporal_host=temporal_host,
+             temporal_api_key=temporal_api_key,
+             organization_id=org_id,
+             control_plane_url=control_plane_url,
+         )
+
+     except HTTPException:
+         raise
+     except Exception as e:
+         logger.error(
+             "worker_registration_failed",
+             error=str(e),
+             environment_name=registration.environment_name,
+             org_id=organization["id"]
+         )
+         raise HTTPException(
+             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+             detail=f"Failed to register worker: {str(e)}"
+         )
+
+
+ @router.post("/heartbeat", status_code=status.HTTP_204_NO_CONTENT)
+ async def worker_heartbeat(
+     heartbeat: WorkerHeartbeatRequest,
+     request: Request,
+     organization: dict = Depends(get_current_organization),
+ ):
+     """
+     Receive heartbeat from a worker.
+
+     OPTIMIZATION: Uses Redis for scalable heartbeat storage instead of database.
+     Database writes are expensive and heartbeats happen every 30s per worker.
+
+     Workers should call this endpoint periodically (e.g., every 30 seconds) to:
+     - Confirm they're still alive
+     - Update their status (active, idle, busy)
+     - Report tasks processed
+     - Update metadata
+     """
+     try:
+         org_id = organization["id"]
+         redis_client = get_redis_client()
+
+         if not redis_client:
+             # Redis not available - log warning but don't fail (graceful degradation)
+             logger.warning(
+                 "worker_heartbeat_redis_unavailable",
+                 worker_id=heartbeat.worker_id,
+                 org_id=org_id,
+             )
+             return None
+
+         # Build heartbeat data for Redis
+         heartbeat_data = {
+             "worker_id": heartbeat.worker_id,
+             "organization_id": org_id,
+             "environment_name": heartbeat.environment_name,
+             "status": heartbeat.status,
+             "tasks_processed": heartbeat.tasks_processed,
+             "current_task_id": heartbeat.current_task_id,
+             "last_heartbeat": datetime.utcnow().isoformat(),
+             "metadata": heartbeat.worker_metadata,
+         }
+
+         # Store in Redis with 5-minute TTL (if worker crashes, heartbeat expires)
+         redis_key = f"worker:{heartbeat.worker_id}:heartbeat"
+         await redis_client.set(redis_key, json.dumps(heartbeat_data), ex=300)
+
+         logger.debug(
+             "worker_heartbeat_received",
+             worker_id=heartbeat.worker_id,
+             status=heartbeat.status,
+             environment_name=heartbeat.environment_name,
+             org_id=org_id,
+         )
+
+         return None
+
+     except Exception as e:
+         logger.error(
+             "worker_heartbeat_failed",
+             error=str(e),
+             worker_id=heartbeat.worker_id,
+             org_id=organization["id"]
+         )
+         # Don't fail the worker if heartbeat fails - graceful degradation
+         return None
+
+
+ # Worker ID-based endpoints (new architecture)
+
+
+ class WorkerStartRequest(BaseModel):
+     """Request to start a worker and fetch its config"""
+     system_info: Dict[str, Any] = {}
+
+
+ class WorkerConfigResponse(BaseModel):
+     """Worker configuration response"""
+     worker_id: str
+     worker_queue_name: str
+     environment_name: str
+     task_queue_name: str  # Full: org.env.worker_queue
+     temporal_namespace: str
+     temporal_host: str
+     temporal_api_key: str
+     organization_id: str
+     control_plane_url: str
+     heartbeat_interval: int = 30
+     # LiteLLM configuration
+     litellm_api_url: str
+     litellm_api_key: str
+
+
+ class WorkerSystemInfo(BaseModel):
+     """Worker system information"""
+     hostname: Optional[str] = None
+     platform: Optional[str] = None
+     os_name: Optional[str] = None
+     os_version: Optional[str] = None
+     python_version: Optional[str] = None
+     cli_version: Optional[str] = None
+     docker_available: Optional[bool] = None
+     docker_version: Optional[str] = None
+     cpu_count: Optional[int] = None
+     cpu_percent: Optional[float] = None
+     memory_total: Optional[int] = None  # bytes
+     memory_used: Optional[int] = None  # bytes
+     memory_percent: Optional[float] = None
+     disk_total: Optional[int] = None  # bytes
+     disk_used: Optional[int] = None  # bytes
+     disk_percent: Optional[float] = None
+     uptime_seconds: Optional[float] = None
+
+
+ class WorkerHeartbeatSimple(BaseModel):
+     """Simplified heartbeat request (worker_id in URL)"""
+     status: str = "active"
+     tasks_processed: int = 0
+     current_task_id: Optional[str] = None
+     worker_metadata: Dict[str, Any] = {}
+     system_info: Optional[WorkerSystemInfo] = None
+     logs: Optional[List[str]] = None  # Recent log lines since last heartbeat
+
+
+ @router.post("/{worker_id}/start", response_model=WorkerConfigResponse)
+ async def start_worker(
+     worker_id: str,
+     start_request: WorkerStartRequest,
+     request: Request,
+     organization: dict = Depends(get_current_organization),
+ ):
+     """
+     Start a worker and fetch its configuration.
+
+     This endpoint is called by workers on startup with just worker_id and API key.
+     It returns all necessary configuration for the worker to connect to Temporal.
+
+     Args:
+         worker_id: Worker ID (UUID created in UI)
+         start_request: System information from worker
+
+     Returns:
+         Complete worker configuration including Temporal credentials
+     """
+     try:
+         client = get_supabase()
+         org_id = organization["id"]
+
+         # Look up worker in database
+         worker_result = (
+             client.table("worker_heartbeats")
+             .select("*")
+             .eq("id", worker_id)
+             .eq("organization_id", org_id)
+             .single()
+             .execute()
+         )
+
+         if not worker_result.data:
+             raise HTTPException(
+                 status_code=status.HTTP_404_NOT_FOUND,
+                 detail=f"Worker '{worker_id}' not found"
+             )
+
+         worker = worker_result.data
+
+         # Get worker queue separately
+         if not worker.get("worker_queue_id"):
+             raise HTTPException(
+                 status_code=status.HTTP_400_BAD_REQUEST,
+                 detail=f"Worker has no queue assigned"
+             )
+
+         queue_result = (
+             client.table("worker_queues")
+             .select("*")
+             .eq("id", worker["worker_queue_id"])
+             .eq("organization_id", org_id)
+             .single()
+             .execute()
+         )
+
+         if not queue_result.data:
+             raise HTTPException(
+                 status_code=status.HTTP_404_NOT_FOUND,
+                 detail=f"Worker queue not found"
+             )
+
+         worker_queue = queue_result.data
+         worker_queue_name = worker_queue["name"]
+
+         # Get environment separately
+         environment_name = "default"
+         if worker_queue.get("environment_id"):
+             env_result = (
+                 client.table("environments")
+                 .select("name")
+                 .eq("id", worker_queue["environment_id"])
+                 .eq("organization_id", org_id)
+                 .maybe_single()
+                 .execute()
+             )
+             if env_result.data:
+                 environment_name = env_result.data["name"]
+
+         # TEMPORARY: Skip database lookup and use fixed namespace + admin API key
+         import os
+
+         # Use fixed namespace for testing
+         namespace = {
+             "namespace_name": "agent-control-plane.lpagu",
+             "api_key_encrypted": os.getenv("TEMPORAL_CLOUD_ADMIN_TOKEN", ""),
+             "status": "ready"
+         }
+
+         logger.info(
+             "using_fixed_namespace_for_testing",
+             namespace_name=namespace["namespace_name"],
+             worker_id=worker_id,
+             org_id=org_id,
+         )
+
+         # Update worker with system info and mark as starting
+         update_data = {
+             "worker_metadata": {
+                 **worker.get("worker_metadata", {}),
+                 **start_request.system_info,
+                 "last_start": datetime.utcnow().isoformat(),
+             },
+             "status": "active",
+             "last_heartbeat": datetime.utcnow().isoformat(),
+             "updated_at": datetime.utcnow().isoformat(),
+         }
+
+         client.table("worker_heartbeats").update(update_data).eq("id", worker_id).execute()
+
+         # Build full task queue name
+         task_queue_name = f"{org_id}.{environment_name}.{worker_queue_name}"
+
+         # Get Temporal Cloud configuration
+         import os
+         temporal_host = os.getenv("TEMPORAL_HOST", "us-east-1.aws.api.temporal.io:7233")
+         temporal_api_key = namespace.get("api_key_encrypted", "")
+
+         # Get control plane URL
+         control_plane_url = os.getenv("CONTROL_PLANE_URL")
+         if not control_plane_url:
+             control_plane_url = f"{request.url.scheme}://{request.url.netloc}"
+
+         # Get LiteLLM configuration from environment
+         litellm_api_url = os.getenv("LITELLM_API_URL", "https://api.openai.com/v1")
+         litellm_api_key = os.getenv("LITELLM_API_KEY", "")
+
+         logger.info(
+             "worker_config_fetched",
+             worker_id=worker_id,
+             task_queue=task_queue_name,
+             environment=environment_name,
+             worker_queue=worker_queue_name,
+             org_id=org_id,
+         )
+
+         return WorkerConfigResponse(
+             worker_id=worker_id,
+             worker_queue_name=worker_queue_name,
+             environment_name=environment_name,
+             task_queue_name=task_queue_name,
+             temporal_namespace=namespace.get("namespace_name"),
+             temporal_host=temporal_host,
+             temporal_api_key=temporal_api_key,
+             organization_id=org_id,
+             control_plane_url=control_plane_url,
+             heartbeat_interval=worker_queue.get("heartbeat_interval", 30),
+             litellm_api_url=litellm_api_url,
+             litellm_api_key=litellm_api_key,
+         )
+
+     except HTTPException:
+         raise
+     except Exception as e:
+         logger.error(
+             "worker_start_failed",
+             error=str(e),
+             worker_id=worker_id,
+             org_id=organization.get("id")
+         )
+         raise HTTPException(
+             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+             detail=f"Failed to start worker: {str(e)}"
+         )
+
+
+ @router.post("/{worker_id}/heartbeat", status_code=status.HTTP_204_NO_CONTENT)
+ async def worker_heartbeat_simple(
+     worker_id: str,
+     heartbeat: WorkerHeartbeatSimple,
+     request: Request,
+     organization: dict = Depends(get_current_organization),
+ ):
+     """
+     Receive heartbeat from a worker (simplified version with worker_id in URL).
+
+     OPTIMIZATION: Uses Redis for scalable heartbeat storage instead of database.
+     Database writes are expensive and heartbeats happen every 30s per worker.
+     Redis provides sub-millisecond writes and automatic TTL expiration.
+
+     Args:
+         worker_id: Worker ID (UUID)
+         heartbeat: Heartbeat data
+     """
+     try:
+         org_id = organization["id"]
+         redis_client = get_redis_client()
+
+         if not redis_client:
+             # Redis not available - log warning but don't fail (graceful degradation)
+             logger.warning(
+                 "worker_heartbeat_redis_unavailable",
+                 worker_id=worker_id,
+                 org_id=org_id,
+             )
+             return None
+
+         # Build heartbeat data for Redis
+         heartbeat_data = {
+             "worker_id": worker_id,
+             "organization_id": org_id,
+             "status": heartbeat.status,
+             "tasks_processed": heartbeat.tasks_processed,
+             "current_task_id": heartbeat.current_task_id,
+             "last_heartbeat": datetime.utcnow().isoformat(),
+             "metadata": heartbeat.worker_metadata,
+         }
+
+         if heartbeat.system_info:
+             heartbeat_data["system_info"] = heartbeat.system_info.dict(exclude_none=True)
+
+         # Handle logs - fetch from Redis and append new logs
+         redis_key = f"worker:{worker_id}:heartbeat"
+         if heartbeat.logs:
+             try:
+                 # Get existing heartbeat data to retrieve logs
+                 existing_data = await redis_client.get(redis_key)
+                 if existing_data:
+                     existing_heartbeat = json.loads(existing_data)
+                     existing_logs = existing_heartbeat.get("logs", [])
+                     all_logs = existing_logs + heartbeat.logs
+                     heartbeat_data["logs"] = all_logs[-100:]  # Keep last 100 lines
+                 else:
+                     heartbeat_data["logs"] = heartbeat.logs[-100:]
+             except Exception as log_error:
+                 logger.warning("heartbeat_log_merge_failed", error=str(log_error))
+                 heartbeat_data["logs"] = heartbeat.logs[-100:]
+
+         # Store in Redis with 5-minute TTL (if worker crashes, heartbeat expires)
+         # TTL is 10x the heartbeat interval (30s * 10 = 300s) for safety
+         await redis_client.set(redis_key, json.dumps(heartbeat_data), ex=300)
+
+         logger.debug(
+             "worker_heartbeat_received",
+             worker_id=worker_id,
+             status=heartbeat.status,
+             org_id=org_id,
+         )
+
+         return None
+
+     except Exception as e:
+         logger.error(
+             "worker_heartbeat_failed",
+             error=str(e),
+             worker_id=worker_id,
+             org_id=organization.get("id")
+         )
+         # Don't fail the worker if heartbeat fails - graceful degradation
+         return None
+
+
+ class WorkerDisconnectRequest(BaseModel):
+     """Worker disconnect request"""
+     reason: str = "shutdown"  # shutdown, error, crash, etc.
+     exit_code: Optional[int] = None
+     error_message: Optional[str] = None
+
+
+ @router.post("/{worker_id}/disconnect", status_code=status.HTTP_204_NO_CONTENT)
+ async def worker_disconnect(
+     worker_id: str,
+     disconnect: WorkerDisconnectRequest,
+     request: Request,
+     organization: dict = Depends(get_current_organization),
+ ):
+     """
+     Mark a worker as disconnected/offline.
+
+     This endpoint is called by workers when they:
+     - Shut down gracefully (Ctrl+C)
+     - Exit due to an error
+     - Crash unexpectedly (via atexit handler)
+
+     Args:
+         worker_id: Worker ID (UUID)
+         disconnect: Disconnect details (reason, exit code, error)
+     """
+     try:
+         client = get_supabase()
+         org_id = organization["id"]
+
+         # Update worker status to disconnected in database
+         update_data = {
+             "status": "disconnected",
+             "last_heartbeat": datetime.utcnow().isoformat(),
+             "worker_metadata": {
+                 "disconnect_reason": disconnect.reason,
+                 "disconnect_time": datetime.utcnow().isoformat(),
+                 "exit_code": disconnect.exit_code,
+                 "error_message": disconnect.error_message,
+             },
+             "updated_at": datetime.utcnow().isoformat(),
+         }
+
+         result = (
+             client.table("worker_heartbeats")
+             .update(update_data)
+             .eq("id", worker_id)
+             .eq("organization_id", org_id)
+             .execute()
+         )
+
+         if not result.data:
+             logger.warning(
+                 "worker_disconnect_not_found",
+                 worker_id=worker_id,
+                 org_id=org_id,
+             )
+             raise HTTPException(
+                 status_code=status.HTTP_404_NOT_FOUND,
+                 detail="Worker not found"
+             )
+
+         # IMPORTANT: Also remove from Redis immediately so UI updates instantly
+         redis_client = get_redis_client()
+         if redis_client:
+             redis_key = f"worker:{worker_id}:heartbeat"
+             try:
+                 # Delete the heartbeat key from Redis
+                 await redis_client.delete(redis_key)
+                 logger.info(
+                     "worker_removed_from_redis",
+                     worker_id=worker_id,
+                     redis_key=redis_key
+                 )
+             except Exception as redis_error:
+                 # Log but don't fail the disconnect
+                 logger.warning(
+                     "redis_delete_failed",
+                     error=str(redis_error),
+                     worker_id=worker_id
+                 )
+
+         logger.info(
+             "worker_disconnected",
+             worker_id=worker_id,
+             reason=disconnect.reason,
+             exit_code=disconnect.exit_code,
+             org_id=org_id,
+         )
+
+         return None
+
+     except HTTPException:
+         raise
+     except Exception as e:
+         logger.error(
+             "worker_disconnect_failed",
+             error=str(e),
+             worker_id=worker_id,
+             org_id=organization.get("id")
+         )
+         raise HTTPException(
+             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+             detail=f"Failed to process disconnect: {str(e)}"
+         )