kubiya-control-plane-api 0.1.0__py3-none-any.whl → 0.3.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kubiya-control-plane-api might be problematic. Click here for more details.

Files changed (185) hide show
  1. control_plane_api/README.md +266 -0
  2. control_plane_api/__init__.py +0 -0
  3. control_plane_api/__version__.py +1 -0
  4. control_plane_api/alembic/README +1 -0
  5. control_plane_api/alembic/env.py +98 -0
  6. control_plane_api/alembic/script.py.mako +28 -0
  7. control_plane_api/alembic/versions/1382bec74309_initial_migration_with_all_models.py +251 -0
  8. control_plane_api/alembic/versions/1f54bc2a37e3_add_analytics_tables.py +162 -0
  9. control_plane_api/alembic/versions/2e4cb136dc10_rename_toolset_ids_to_skill_ids_in_teams.py +30 -0
  10. control_plane_api/alembic/versions/31cd69a644ce_add_skill_templates_table.py +28 -0
  11. control_plane_api/alembic/versions/89e127caa47d_add_jobs_and_job_executions_tables.py +161 -0
  12. control_plane_api/alembic/versions/add_llm_models_table.py +51 -0
  13. control_plane_api/alembic/versions/b0e10697f212_add_runtime_column_to_teams_simple.py +42 -0
  14. control_plane_api/alembic/versions/ce43b24b63bf_add_execution_trigger_source_and_fix_.py +155 -0
  15. control_plane_api/alembic/versions/d4eaf16e3f8d_rename_toolsets_to_skills.py +84 -0
  16. control_plane_api/alembic/versions/efa2dc427da1_rename_metadata_to_custom_metadata.py +32 -0
  17. control_plane_api/alembic/versions/f973b431d1ce_add_workflow_executor_to_skill_types.py +44 -0
  18. control_plane_api/alembic.ini +148 -0
  19. control_plane_api/api/index.py +12 -0
  20. control_plane_api/app/__init__.py +11 -0
  21. control_plane_api/app/activities/__init__.py +20 -0
  22. control_plane_api/app/activities/agent_activities.py +379 -0
  23. control_plane_api/app/activities/team_activities.py +410 -0
  24. control_plane_api/app/activities/temporal_cloud_activities.py +577 -0
  25. control_plane_api/app/config/__init__.py +35 -0
  26. control_plane_api/app/config/api_config.py +354 -0
  27. control_plane_api/app/config/model_pricing.py +318 -0
  28. control_plane_api/app/config.py +95 -0
  29. control_plane_api/app/database.py +135 -0
  30. control_plane_api/app/exceptions.py +408 -0
  31. control_plane_api/app/lib/__init__.py +11 -0
  32. control_plane_api/app/lib/job_executor.py +312 -0
  33. control_plane_api/app/lib/kubiya_client.py +235 -0
  34. control_plane_api/app/lib/litellm_pricing.py +166 -0
  35. control_plane_api/app/lib/planning_tools/__init__.py +22 -0
  36. control_plane_api/app/lib/planning_tools/agents.py +155 -0
  37. control_plane_api/app/lib/planning_tools/base.py +189 -0
  38. control_plane_api/app/lib/planning_tools/environments.py +214 -0
  39. control_plane_api/app/lib/planning_tools/resources.py +240 -0
  40. control_plane_api/app/lib/planning_tools/teams.py +198 -0
  41. control_plane_api/app/lib/policy_enforcer_client.py +939 -0
  42. control_plane_api/app/lib/redis_client.py +436 -0
  43. control_plane_api/app/lib/supabase.py +71 -0
  44. control_plane_api/app/lib/temporal_client.py +138 -0
  45. control_plane_api/app/lib/validation/__init__.py +20 -0
  46. control_plane_api/app/lib/validation/runtime_validation.py +287 -0
  47. control_plane_api/app/main.py +128 -0
  48. control_plane_api/app/middleware/__init__.py +8 -0
  49. control_plane_api/app/middleware/auth.py +513 -0
  50. control_plane_api/app/middleware/exception_handler.py +267 -0
  51. control_plane_api/app/middleware/rate_limiting.py +384 -0
  52. control_plane_api/app/middleware/request_id.py +202 -0
  53. control_plane_api/app/models/__init__.py +27 -0
  54. control_plane_api/app/models/agent.py +79 -0
  55. control_plane_api/app/models/analytics.py +206 -0
  56. control_plane_api/app/models/associations.py +81 -0
  57. control_plane_api/app/models/environment.py +63 -0
  58. control_plane_api/app/models/execution.py +93 -0
  59. control_plane_api/app/models/job.py +179 -0
  60. control_plane_api/app/models/llm_model.py +75 -0
  61. control_plane_api/app/models/presence.py +49 -0
  62. control_plane_api/app/models/project.py +47 -0
  63. control_plane_api/app/models/session.py +38 -0
  64. control_plane_api/app/models/team.py +66 -0
  65. control_plane_api/app/models/workflow.py +55 -0
  66. control_plane_api/app/policies/README.md +121 -0
  67. control_plane_api/app/policies/approved_users.rego +62 -0
  68. control_plane_api/app/policies/business_hours.rego +51 -0
  69. control_plane_api/app/policies/rate_limiting.rego +100 -0
  70. control_plane_api/app/policies/tool_restrictions.rego +86 -0
  71. control_plane_api/app/routers/__init__.py +4 -0
  72. control_plane_api/app/routers/agents.py +364 -0
  73. control_plane_api/app/routers/agents_v2.py +1260 -0
  74. control_plane_api/app/routers/analytics.py +1014 -0
  75. control_plane_api/app/routers/context_manager.py +562 -0
  76. control_plane_api/app/routers/environment_context.py +270 -0
  77. control_plane_api/app/routers/environments.py +715 -0
  78. control_plane_api/app/routers/execution_environment.py +517 -0
  79. control_plane_api/app/routers/executions.py +1911 -0
  80. control_plane_api/app/routers/health.py +92 -0
  81. control_plane_api/app/routers/health_v2.py +326 -0
  82. control_plane_api/app/routers/integrations.py +274 -0
  83. control_plane_api/app/routers/jobs.py +1344 -0
  84. control_plane_api/app/routers/models.py +82 -0
  85. control_plane_api/app/routers/models_v2.py +361 -0
  86. control_plane_api/app/routers/policies.py +639 -0
  87. control_plane_api/app/routers/presence.py +234 -0
  88. control_plane_api/app/routers/projects.py +902 -0
  89. control_plane_api/app/routers/runners.py +379 -0
  90. control_plane_api/app/routers/runtimes.py +172 -0
  91. control_plane_api/app/routers/secrets.py +155 -0
  92. control_plane_api/app/routers/skills.py +1001 -0
  93. control_plane_api/app/routers/skills_definitions.py +140 -0
  94. control_plane_api/app/routers/task_planning.py +1256 -0
  95. control_plane_api/app/routers/task_queues.py +654 -0
  96. control_plane_api/app/routers/team_context.py +270 -0
  97. control_plane_api/app/routers/teams.py +1400 -0
  98. control_plane_api/app/routers/worker_queues.py +1545 -0
  99. control_plane_api/app/routers/workers.py +935 -0
  100. control_plane_api/app/routers/workflows.py +204 -0
  101. control_plane_api/app/runtimes/__init__.py +6 -0
  102. control_plane_api/app/runtimes/validation.py +344 -0
  103. control_plane_api/app/schemas/job_schemas.py +295 -0
  104. control_plane_api/app/services/__init__.py +1 -0
  105. control_plane_api/app/services/agno_service.py +619 -0
  106. control_plane_api/app/services/litellm_service.py +190 -0
  107. control_plane_api/app/services/policy_service.py +525 -0
  108. control_plane_api/app/services/temporal_cloud_provisioning.py +150 -0
  109. control_plane_api/app/skills/__init__.py +44 -0
  110. control_plane_api/app/skills/base.py +229 -0
  111. control_plane_api/app/skills/business_intelligence.py +189 -0
  112. control_plane_api/app/skills/data_visualization.py +154 -0
  113. control_plane_api/app/skills/docker.py +104 -0
  114. control_plane_api/app/skills/file_generation.py +94 -0
  115. control_plane_api/app/skills/file_system.py +110 -0
  116. control_plane_api/app/skills/python.py +92 -0
  117. control_plane_api/app/skills/registry.py +65 -0
  118. control_plane_api/app/skills/shell.py +102 -0
  119. control_plane_api/app/skills/workflow_executor.py +469 -0
  120. control_plane_api/app/utils/workflow_executor.py +354 -0
  121. control_plane_api/app/workflows/__init__.py +11 -0
  122. control_plane_api/app/workflows/agent_execution.py +507 -0
  123. control_plane_api/app/workflows/agent_execution_with_skills.py +222 -0
  124. control_plane_api/app/workflows/namespace_provisioning.py +326 -0
  125. control_plane_api/app/workflows/team_execution.py +399 -0
  126. control_plane_api/scripts/seed_models.py +239 -0
  127. control_plane_api/worker/__init__.py +0 -0
  128. control_plane_api/worker/activities/__init__.py +0 -0
  129. control_plane_api/worker/activities/agent_activities.py +1241 -0
  130. control_plane_api/worker/activities/approval_activities.py +234 -0
  131. control_plane_api/worker/activities/runtime_activities.py +388 -0
  132. control_plane_api/worker/activities/skill_activities.py +267 -0
  133. control_plane_api/worker/activities/team_activities.py +1217 -0
  134. control_plane_api/worker/config/__init__.py +31 -0
  135. control_plane_api/worker/config/worker_config.py +275 -0
  136. control_plane_api/worker/control_plane_client.py +529 -0
  137. control_plane_api/worker/examples/analytics_integration_example.py +362 -0
  138. control_plane_api/worker/models/__init__.py +1 -0
  139. control_plane_api/worker/models/inputs.py +89 -0
  140. control_plane_api/worker/runtimes/__init__.py +31 -0
  141. control_plane_api/worker/runtimes/base.py +789 -0
  142. control_plane_api/worker/runtimes/claude_code_runtime.py +1443 -0
  143. control_plane_api/worker/runtimes/default_runtime.py +617 -0
  144. control_plane_api/worker/runtimes/factory.py +173 -0
  145. control_plane_api/worker/runtimes/validation.py +93 -0
  146. control_plane_api/worker/services/__init__.py +1 -0
  147. control_plane_api/worker/services/agent_executor.py +422 -0
  148. control_plane_api/worker/services/agent_executor_v2.py +383 -0
  149. control_plane_api/worker/services/analytics_collector.py +457 -0
  150. control_plane_api/worker/services/analytics_service.py +464 -0
  151. control_plane_api/worker/services/approval_tools.py +310 -0
  152. control_plane_api/worker/services/approval_tools_agno.py +207 -0
  153. control_plane_api/worker/services/cancellation_manager.py +177 -0
  154. control_plane_api/worker/services/data_visualization.py +827 -0
  155. control_plane_api/worker/services/jira_tools.py +257 -0
  156. control_plane_api/worker/services/runtime_analytics.py +328 -0
  157. control_plane_api/worker/services/session_service.py +194 -0
  158. control_plane_api/worker/services/skill_factory.py +175 -0
  159. control_plane_api/worker/services/team_executor.py +574 -0
  160. control_plane_api/worker/services/team_executor_v2.py +465 -0
  161. control_plane_api/worker/services/workflow_executor_tools.py +1418 -0
  162. control_plane_api/worker/tests/__init__.py +1 -0
  163. control_plane_api/worker/tests/e2e/__init__.py +0 -0
  164. control_plane_api/worker/tests/e2e/test_execution_flow.py +571 -0
  165. control_plane_api/worker/tests/integration/__init__.py +0 -0
  166. control_plane_api/worker/tests/integration/test_control_plane_integration.py +308 -0
  167. control_plane_api/worker/tests/unit/__init__.py +0 -0
  168. control_plane_api/worker/tests/unit/test_control_plane_client.py +401 -0
  169. control_plane_api/worker/utils/__init__.py +1 -0
  170. control_plane_api/worker/utils/chunk_batcher.py +305 -0
  171. control_plane_api/worker/utils/retry_utils.py +60 -0
  172. control_plane_api/worker/utils/streaming_utils.py +373 -0
  173. control_plane_api/worker/worker.py +753 -0
  174. control_plane_api/worker/workflows/__init__.py +0 -0
  175. control_plane_api/worker/workflows/agent_execution.py +589 -0
  176. control_plane_api/worker/workflows/team_execution.py +429 -0
  177. kubiya_control_plane_api-0.3.4.dist-info/METADATA +229 -0
  178. kubiya_control_plane_api-0.3.4.dist-info/RECORD +182 -0
  179. kubiya_control_plane_api-0.3.4.dist-info/entry_points.txt +2 -0
  180. kubiya_control_plane_api-0.3.4.dist-info/top_level.txt +1 -0
  181. kubiya_control_plane_api-0.1.0.dist-info/METADATA +0 -66
  182. kubiya_control_plane_api-0.1.0.dist-info/RECORD +0 -5
  183. kubiya_control_plane_api-0.1.0.dist-info/top_level.txt +0 -1
  184. {kubiya_control_plane_api-0.1.0.dist-info/licenses → control_plane_api}/LICENSE +0 -0
  185. {kubiya_control_plane_api-0.1.0.dist-info → kubiya_control_plane_api-0.3.4.dist-info}/WHEEL +0 -0
@@ -0,0 +1,1545 @@
1
+ """
2
+ Worker Queues router - Manage worker queues within environments.
3
+
4
+ Each environment can have multiple worker queues for fine-grained worker management.
5
+ Task queue naming: {org_id}.{environment_name}.{worker_queue_name}
6
+ """
7
+
8
+ from fastapi import APIRouter, Depends, HTTPException, status, Request
9
+ from fastapi.responses import PlainTextResponse
10
+ from typing import List, Optional, Literal
11
+ from datetime import datetime, timedelta
12
+ from pydantic import BaseModel, Field
13
+ import structlog
14
+ import uuid
15
+ import os
16
+ import json
17
+
18
+ from control_plane_api.app.middleware.auth import get_current_organization
19
+ from control_plane_api.app.lib.supabase import get_supabase
20
+ from control_plane_api.app.lib.redis_client import get_redis_client
21
+
22
+ logger = structlog.get_logger()
23
+
24
+ router = APIRouter()
25
+
26
+ # Stale worker threshold: 60 seconds (2x the default heartbeat interval of 30s)
27
+ STALE_WORKER_THRESHOLD_SECONDS = 60
28
+
29
+
30
async def get_active_workers_from_redis(org_id: str, queue_id: Optional[str] = None) -> dict:
    """
    Get active workers from Redis heartbeats.

    Redis heartbeats have automatic TTL (5 minutes), so if a worker hasn't sent a heartbeat
    the key will automatically expire. This eliminates the need to manually mark workers as stale.

    Args:
        org_id: Organization ID
        queue_id: Optional queue ID to filter by

    Returns:
        Dict with worker_id -> heartbeat_data mapping. Empty dict if Redis is
        unavailable or the lookup fails (best-effort semantics).
    """
    redis_client = get_redis_client()

    if not redis_client:
        logger.warning("redis_unavailable_for_worker_query", org_id=org_id)
        return {}

    try:
        # Worker records come from the DB so we can map worker_id -> queue_id;
        # liveness itself is decided by the Redis heartbeat keys below.
        client = get_supabase()
        query = (
            client.table("worker_heartbeats")
            .select("id, worker_queue_id")
            .eq("organization_id", org_id)
        )
        # Filter in the DB when a queue is specified instead of fetching every
        # org worker and skipping mismatches in Python.
        if queue_id:
            query = query.eq("worker_queue_id", queue_id)
        workers_db = query.execute()

        active_workers = {}

        for worker in workers_db.data or []:
            worker_id = worker["id"]
            worker_queue_id = worker.get("worker_queue_id")

            # Heartbeat key only exists while within the Redis TTL window.
            redis_key = f"worker:{worker_id}:heartbeat"
            heartbeat_data = await redis_client.get(redis_key)

            if not heartbeat_data:
                continue

            try:
                data = json.loads(heartbeat_data)
                # Secondary freshness check on the timestamp carried in the payload.
                # NOTE(review): assumes "last_heartbeat" is a naive UTC ISO string
                # (it is compared against datetime.utcnow()) — confirm with the writer side.
                last_heartbeat = datetime.fromisoformat(data.get("last_heartbeat", ""))
                age_seconds = (datetime.utcnow() - last_heartbeat).total_seconds()

                if age_seconds <= STALE_WORKER_THRESHOLD_SECONDS:
                    active_workers[worker_id] = {
                        **data,
                        "worker_queue_id": worker_queue_id,
                    }
            except (json.JSONDecodeError, ValueError) as e:
                # Malformed payloads are skipped rather than failing the whole scan.
                logger.warning("invalid_heartbeat_data", worker_id=worker_id, error=str(e))
                continue

        return active_workers

    except Exception as e:
        logger.error("failed_to_get_active_workers_from_redis", error=str(e), org_id=org_id)
        return {}
96
+
97
+
98
+ # Pydantic schemas
99
class WorkerQueueCreate(BaseModel):
    """Request payload for creating a worker queue within an environment."""

    # NOTE(review): "lowercase, no spaces" is only described, not enforced by a
    # regex pattern here — rejection of invalid names must happen elsewhere.
    name: str = Field(..., min_length=2, max_length=50, description="Worker queue name (lowercase, no spaces)")
    display_name: Optional[str] = Field(None, description="User-friendly display name")
    description: Optional[str] = Field(None, description="Queue description")
    max_workers: Optional[int] = Field(None, ge=1, description="Max workers allowed (NULL = unlimited)")
    heartbeat_interval: int = Field(30, ge=10, le=300, description="Seconds between heartbeats")
    # Free-form labels and arbitrary per-queue configuration.
    tags: List[str] = Field(default_factory=list)
    settings: dict = Field(default_factory=dict)
107
+
108
+
109
class WorkerQueueUpdate(BaseModel):
    """Partial-update payload; handlers apply only fields explicitly set (exclude_unset)."""

    name: Optional[str] = Field(None, min_length=2, max_length=50)
    display_name: Optional[str] = None
    description: Optional[str] = None
    # NOTE(review): status values are unconstrained here — presumably must match
    # the worker_queues.status column values (e.g. "active"); confirm.
    status: Optional[str] = None
    max_workers: Optional[int] = Field(None, ge=1)
    heartbeat_interval: Optional[int] = Field(None, ge=10, le=300)
    tags: Optional[List[str]] = None
    settings: Optional[dict] = None
118
+
119
+
120
class WorkerQueueResponse(BaseModel):
    """API representation of a worker_queues row plus computed runtime fields."""

    id: str
    organization_id: str
    environment_id: str
    name: str
    display_name: Optional[str]
    description: Optional[str]
    status: str
    max_workers: Optional[int]
    heartbeat_interval: int
    tags: List[str]
    settings: dict
    # ISO-8601 timestamps serialized as strings.
    created_at: str
    updated_at: str
    created_by: Optional[str]
    # Computed
    active_workers: int = 0  # live worker count derived from Redis heartbeats
    task_queue_name: str  # the queue's UUID (used as the task queue name for unpredictability)
138
+
139
+
140
@router.get("/worker-queues", response_model=List[WorkerQueueResponse])
async def list_all_worker_queues(
    request: Request,
    organization: dict = Depends(get_current_organization),
):
    """List all worker queues across all environments for the organization."""
    # Resolve org_id before the try block so the generic exception handler can
    # log it safely — previously an early failure raised NameError in `except`.
    org_id = organization["id"]
    try:
        client = get_supabase()

        # Fetch every queue for the org, joined with its environment name.
        result = (
            client.table("worker_queues")
            .select("*, environments(name)")
            .eq("organization_id", org_id)
            .order("created_at", desc=False)
            .execute()
        )

        if not result.data:
            return []

        # Live workers come from Redis heartbeats (TTL-based expiration).
        active_workers = await get_active_workers_from_redis(org_id)

        # Count active workers per queue.
        worker_counts = {}
        for worker_id, worker_data in active_workers.items():
            queue_id = worker_data.get("worker_queue_id")
            if queue_id:
                worker_counts[queue_id] = worker_counts.get(queue_id, 0) + 1

        queues = []
        for queue in result.data:
            # Queue UUID doubles as the task queue name (unpredictable by design).
            task_queue_name = queue["id"]
            active_worker_count = worker_counts.get(queue["id"], 0)

            # Environment name comes from the join payload.
            env_data = queue.get("environments")
            environment_name = env_data.get("name") if env_data else None

            queue_copy = dict(queue)
            queue_copy.pop("environments", None)  # strip join data before model construction

            # NOTE(review): environment_name is not declared on WorkerQueueResponse,
            # so pydantic silently drops it unless extras are allowed — confirm intent.
            queues.append(
                WorkerQueueResponse(
                    **queue_copy,
                    active_workers=active_worker_count,
                    task_queue_name=task_queue_name,
                    environment_name=environment_name,
                )
            )

        logger.info(
            "all_worker_queues_listed",
            count=len(queues),
            org_id=org_id,
        )

        return queues

    except HTTPException:
        raise
    except Exception as e:
        logger.error("all_worker_queues_list_failed", error=str(e), org_id=org_id)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to list all worker queues: {str(e)}"
        )
211
+
212
+
213
@router.post("/environments/{environment_id}/worker-queues", response_model=WorkerQueueResponse, status_code=status.HTTP_201_CREATED)
async def create_worker_queue(
    environment_id: str,
    queue_data: WorkerQueueCreate,
    request: Request,
    organization: dict = Depends(get_current_organization),
):
    """Create a new worker queue within an environment.

    Raises:
        HTTPException: 404 if the environment does not exist in this organization,
            409 if a queue with the same name already exists in the environment,
            500 on insert failure or unexpected errors.
    """
    org_id = organization["id"]
    try:
        client = get_supabase()

        # Validate the environment exists and belongs to this org.
        # maybe_single() is used instead of single() so a missing row yields an
        # empty result (-> 404 below) rather than a PostgREST error (-> 500).
        env_result = (
            client.table("environments")
            .select("id, name")
            .eq("id", environment_id)
            .eq("organization_id", org_id)
            .maybe_single()
            .execute()
        )

        if not env_result or not env_result.data:
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail="Environment not found"
            )

        # Enforce per-environment queue-name uniqueness.
        existing = (
            client.table("worker_queues")
            .select("id")
            .eq("environment_id", environment_id)
            .eq("name", queue_data.name)
            .execute()
        )

        if existing.data:
            raise HTTPException(
                status_code=status.HTTP_409_CONFLICT,
                detail=f"Worker queue '{queue_data.name}' already exists in this environment"
            )

        queue_id = str(uuid.uuid4())
        now = datetime.utcnow().isoformat()

        queue_record = {
            "id": queue_id,
            "organization_id": org_id,
            "environment_id": environment_id,
            "name": queue_data.name,
            "display_name": queue_data.display_name or queue_data.name,
            "description": queue_data.description,
            "status": "active",
            "max_workers": queue_data.max_workers,
            "heartbeat_interval": queue_data.heartbeat_interval,
            "tags": queue_data.tags,
            "settings": queue_data.settings,
            "created_at": now,
            "updated_at": now,
            "created_by": organization.get("user_id"),
        }

        result = client.table("worker_queues").insert(queue_record).execute()

        if not result.data:
            raise HTTPException(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail="Failed to create worker queue"
            )

        queue = result.data[0]

        # Queue UUID doubles as the task queue name (unpredictable by design).
        task_queue_name = queue_id

        logger.info(
            "worker_queue_created",
            queue_id=queue_id,
            queue_name=queue["name"],
            environment_id=environment_id,
            task_queue_name=task_queue_name,
            org_id=org_id,
        )

        # A freshly created queue cannot have workers yet.
        return WorkerQueueResponse(
            **queue,
            active_workers=0,
            task_queue_name=task_queue_name,
        )

    except HTTPException:
        raise
    except Exception as e:
        logger.error("worker_queue_creation_failed", error=str(e), org_id=org_id)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to create worker queue: {str(e)}"
        )
315
+
316
+
317
@router.get("/environments/{environment_id}/worker-queues", response_model=List[WorkerQueueResponse])
async def list_worker_queues(
    environment_id: str,
    request: Request,
    organization: dict = Depends(get_current_organization),
):
    """List all worker queues in an environment.

    Raises:
        HTTPException: 404 if the environment does not exist in this organization,
            500 on unexpected errors.
    """
    try:
        client = get_supabase()
        org_id = organization["id"]

        # Validate the environment exists and belongs to this org.
        # maybe_single() so a missing row yields 404 below instead of a
        # PostgREST "no rows" error surfacing as 500.
        env_result = (
            client.table("environments")
            .select("name")
            .eq("id", environment_id)
            .eq("organization_id", org_id)
            .maybe_single()
            .execute()
        )

        if not env_result or not env_result.data:
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail="Environment not found"
            )

        result = (
            client.table("worker_queues")
            .select("*")
            .eq("environment_id", environment_id)
            .order("created_at", desc=False)
            .execute()
        )

        if not result.data:
            return []

        # Live workers come from Redis heartbeats (TTL-based expiration).
        active_workers = await get_active_workers_from_redis(org_id)

        # Count active workers per queue.
        worker_counts = {}
        for worker_id, worker_data in active_workers.items():
            queue_id = worker_data.get("worker_queue_id")
            if queue_id:
                worker_counts[queue_id] = worker_counts.get(queue_id, 0) + 1

        queues = []
        for queue in result.data:
            # Queue UUID doubles as the task queue name (unpredictable by design).
            task_queue_name = queue["id"]
            # Distinct name (matches list_all_worker_queues) — previously this
            # shadowed the active_workers dict above inside the loop.
            active_worker_count = worker_counts.get(queue["id"], 0)

            queues.append(
                WorkerQueueResponse(
                    **queue,
                    active_workers=active_worker_count,
                    task_queue_name=task_queue_name,
                )
            )

        logger.info(
            "worker_queues_listed",
            count=len(queues),
            environment_id=environment_id,
            org_id=org_id,
        )

        return queues

    except HTTPException:
        raise
    except Exception as e:
        logger.error("worker_queues_list_failed", error=str(e), environment_id=environment_id)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to list worker queues: {str(e)}"
        )
400
+
401
+
402
@router.get("/worker-queues/{queue_id}", response_model=WorkerQueueResponse)
async def get_worker_queue(
    queue_id: str,
    request: Request,
    organization: dict = Depends(get_current_organization),
):
    """Get a specific worker queue by ID.

    Raises:
        HTTPException: 404 if the queue does not exist in this organization,
            500 on unexpected errors.
    """
    try:
        client = get_supabase()
        org_id = organization["id"]

        # maybe_single() so a missing row reaches the 404 branch below —
        # single() raises on zero rows, which previously surfaced as 500.
        result = (
            client.table("worker_queues")
            .select("*")
            .eq("id", queue_id)
            .eq("organization_id", org_id)
            .maybe_single()
            .execute()
        )

        if not result or not result.data:
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail="Worker queue not found"
            )

        queue = result.data

        # Resolve the environment name (best-effort; falls back to "unknown").
        environment_name = "unknown"
        if queue.get("environment_id"):
            env_result = (
                client.table("environments")
                .select("name")
                .eq("id", queue["environment_id"])
                .eq("organization_id", org_id)
                .maybe_single()
                .execute()
            )
            if env_result and env_result.data:
                environment_name = env_result.data["name"]

        # Live worker count for this specific queue (Redis heartbeats).
        active_workers_dict = await get_active_workers_from_redis(org_id, queue_id)
        active_workers = len(active_workers_dict)

        # Strip any join payload before model construction.
        queue.pop("environments", None)

        # Queue UUID doubles as the task queue name (unpredictable by design).
        task_queue_name = queue_id

        # NOTE(review): environment_name is passed for parity with the list-all
        # endpoint; WorkerQueueResponse does not declare it, so pydantic drops
        # it unless extras are allowed — confirm intent.
        return WorkerQueueResponse(
            **queue,
            active_workers=active_workers,
            task_queue_name=task_queue_name,
            environment_name=environment_name,
        )

    except HTTPException:
        raise
    except Exception as e:
        logger.error("worker_queue_get_failed", error=str(e), queue_id=queue_id)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to get worker queue: {str(e)}"
        )
469
+
470
+
471
@router.patch("/worker-queues/{queue_id}", response_model=WorkerQueueResponse)
async def update_worker_queue(
    queue_id: str,
    queue_data: WorkerQueueUpdate,
    request: Request,
    organization: dict = Depends(get_current_organization),
):
    """Update a worker queue (partial update; only explicitly set fields are applied).

    Raises:
        HTTPException: 404 if the queue does not exist in this organization,
            500 on update failure or unexpected errors.
    """
    try:
        client = get_supabase()
        org_id = organization["id"]

        # Existence check. maybe_single() so a missing row yields 404 below
        # instead of a PostgREST "no rows" error surfacing as 500.
        existing = (
            client.table("worker_queues")
            .select("id, environment_id")
            .eq("id", queue_id)
            .eq("organization_id", org_id)
            .maybe_single()
            .execute()
        )

        if not existing or not existing.data:
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail="Worker queue not found"
            )

        # Only apply the fields the client explicitly set.
        update_data = queue_data.model_dump(exclude_unset=True)
        update_data["updated_at"] = datetime.utcnow().isoformat()

        result = (
            client.table("worker_queues")
            .update(update_data)
            .eq("id", queue_id)
            .eq("organization_id", org_id)
            .execute()
        )

        if not result.data:
            raise HTTPException(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail="Failed to update worker queue"
            )

        queue = result.data[0]

        # Live worker count from Redis heartbeats, consistent with the other
        # endpoints — the DB status column can lag behind reality, and the
        # previous environment-name lookup here was never used in the response.
        active_workers = len(await get_active_workers_from_redis(org_id, queue_id))

        # Queue UUID doubles as the task queue name (unpredictable by design).
        task_queue_name = queue_id

        logger.info(
            "worker_queue_updated",
            queue_id=queue_id,
            org_id=org_id,
        )

        return WorkerQueueResponse(
            **queue,
            active_workers=active_workers,
            task_queue_name=task_queue_name,
        )

    except HTTPException:
        raise
    except Exception as e:
        logger.error("worker_queue_update_failed", error=str(e), queue_id=queue_id)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to update worker queue: {str(e)}"
        )
563
+
564
+
565
@router.delete("/worker-queues/{queue_id}", status_code=status.HTTP_204_NO_CONTENT)
async def delete_worker_queue(
    queue_id: str,
    request: Request,
    organization: dict = Depends(get_current_organization),
):
    """Delete a worker queue.

    Raises:
        HTTPException: 400 if the queue is the protected "default" queue or
            still has active workers, 404 if the queue does not exist in this
            organization, 500 on unexpected errors.
    """
    try:
        client = get_supabase()
        org_id = organization["id"]

        # The "default" queue is protected from deletion.
        # maybe_single() so a missing row falls through to the delete below,
        # which reports 404 — single() would raise and surface as 500.
        queue_check = (
            client.table("worker_queues")
            .select("name")
            .eq("id", queue_id)
            .eq("organization_id", org_id)
            .maybe_single()
            .execute()
        )

        if queue_check and queue_check.data and queue_check.data.get("name") == "default":
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail="Cannot delete the default worker queue"
            )

        # Refuse to delete while workers are still heartbeating (Redis TTL-based).
        active_workers = await get_active_workers_from_redis(org_id, queue_id)

        if active_workers:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail=f"Cannot delete worker queue with {len(active_workers)} active workers"
            )

        result = (
            client.table("worker_queues")
            .delete()
            .eq("id", queue_id)
            .eq("organization_id", org_id)
            .execute()
        )

        # Delete returns the removed rows; empty means nothing matched.
        if not result.data:
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail="Worker queue not found"
            )

        logger.info("worker_queue_deleted", queue_id=queue_id, org_id=org_id)

        return None

    except HTTPException:
        raise
    except Exception as e:
        logger.error("worker_queue_delete_failed", error=str(e), queue_id=queue_id)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to delete worker queue: {str(e)}"
        )
628
+
629
+
630
+ @router.get("/worker-queues/{queue_id}/install-script")
631
+ async def get_installation_script(
632
+ queue_id: str,
633
+ deployment_type: Literal["docker", "kubernetes", "openshift", "local"] = "local",
634
+ request: Request = None,
635
+ organization: dict = Depends(get_current_organization),
636
+ ):
637
+ """
638
+ Generate an installation script for setting up a worker for this queue.
639
+
640
+ Supports multiple deployment types:
641
+ - local: Python virtual environment setup
642
+ - docker: Docker run command
643
+ - kubernetes: Kubernetes deployment YAML
644
+ - openshift: OpenShift deployment YAML
645
+ """
646
+ try:
647
+ client = get_supabase()
648
+ org_id = organization["id"]
649
+
650
+ # Get worker queue details
651
+ result = (
652
+ client.table("worker_queues")
653
+ .select("*")
654
+ .eq("id", queue_id)
655
+ .eq("organization_id", org_id)
656
+ .single()
657
+ .execute()
658
+ )
659
+
660
+ if not result.data:
661
+ raise HTTPException(
662
+ status_code=status.HTTP_404_NOT_FOUND,
663
+ detail="Worker queue not found"
664
+ )
665
+
666
+ queue = result.data
667
+
668
+ # Get environment name separately
669
+ environment_name = "default"
670
+ if queue.get("environment_id"):
671
+ env_result = (
672
+ client.table("environments")
673
+ .select("name")
674
+ .eq("id", queue["environment_id"])
675
+ .eq("organization_id", org_id)
676
+ .maybe_single()
677
+ .execute()
678
+ )
679
+ if env_result.data:
680
+ environment_name = env_result.data["name"]
681
+ queue_name = queue["name"]
682
+
683
+ # Get control plane URL
684
+ control_plane_url = os.getenv("CONTROL_PLANE_URL", "https://agent-control-plane.vercel.app")
685
+
686
+ # Generate new worker ID
687
+ worker_id = str(uuid.uuid4())
688
+
689
+ # Generate script based on deployment type
690
+ if deployment_type == "local":
691
+ script = _generate_local_script(worker_id, control_plane_url)
692
+ elif deployment_type == "docker":
693
+ script = _generate_docker_script(worker_id, control_plane_url, queue_name, environment_name)
694
+ elif deployment_type == "kubernetes":
695
+ script = _generate_kubernetes_script(worker_id, control_plane_url, queue_name, environment_name)
696
+ elif deployment_type == "openshift":
697
+ script = _generate_openshift_script(worker_id, control_plane_url, queue_name, environment_name)
698
+ else:
699
+ raise HTTPException(
700
+ status_code=status.HTTP_400_BAD_REQUEST,
701
+ detail=f"Unsupported deployment type: {deployment_type}"
702
+ )
703
+
704
+ logger.info(
705
+ "installation_script_generated",
706
+ queue_id=queue_id,
707
+ deployment_type=deployment_type,
708
+ worker_id=worker_id,
709
+ org_id=org_id,
710
+ )
711
+
712
+ return PlainTextResponse(content=script, media_type="text/plain")
713
+
714
+ except HTTPException:
715
+ raise
716
+ except Exception as e:
717
+ logger.error("installation_script_generation_failed", error=str(e), queue_id=queue_id)
718
+ raise HTTPException(
719
+ status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
720
+ detail=f"Failed to generate installation script: {str(e)}"
721
+ )
722
+
723
+
724
class WorkerStartResponse(BaseModel):
    """Worker start configuration.

    Returned by the /worker-queues/{queue_id}/start endpoint; contains
    everything a worker process needs to connect to Temporal and report
    back to the control plane.
    """
    worker_id: str  # freshly generated UUID identifying this worker instance
    task_queue_name: str  # The queue UUID
    temporal_namespace: str  # Temporal namespace the worker should connect to
    temporal_host: str  # Temporal frontend endpoint (host:port)
    temporal_api_key: str  # credential for Temporal authentication
    organization_id: str  # owning organization ID
    control_plane_url: str  # base URL the worker uses to call back home
    heartbeat_interval: int  # heartbeat interval (presumably seconds; sourced from queue config, default 30 — confirm)
    # LiteLLM configuration for agno workflows/activities
    litellm_api_url: str
    litellm_api_key: str
    # Queue metadata
    queue_name: str
    environment_name: str
740
+
741
+
742
+ @router.post("/worker-queues/{queue_id}/start", response_model=WorkerStartResponse)
743
+ async def start_worker_for_queue(
744
+ queue_id: str,
745
+ request: Request,
746
+ organization: dict = Depends(get_current_organization),
747
+ ):
748
+ """
749
+ Start a worker for a specific queue.
750
+
751
+ This endpoint is called by the CLI with: kubiya worker start --queue-id={queue_id}
752
+
753
+ Returns all configuration needed for the worker to connect to Temporal.
754
+ """
755
+ try:
756
+ client = get_supabase()
757
+ org_id = organization["id"]
758
+
759
+ # Get worker queue - use maybe_single to avoid exception on missing rows
760
+ try:
761
+ result = (
762
+ client.table("worker_queues")
763
+ .select("*")
764
+ .eq("id", queue_id)
765
+ .eq("organization_id", org_id)
766
+ .maybe_single()
767
+ .execute()
768
+ )
769
+ except Exception as db_error:
770
+ # Handle postgrest 204 No Content response (queue not found)
771
+ error_str = str(db_error)
772
+ if "'code': '204'" in error_str or "Missing response" in error_str:
773
+ # Treat 204 as "no data found" rather than an error
774
+ result = type('obj', (object,), {'data': None})()
775
+ else:
776
+ logger.error("database_query_failed", error=str(db_error), queue_id=queue_id)
777
+ raise HTTPException(
778
+ status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
779
+ detail=f"Database query failed. Please contact support."
780
+ ) from db_error
781
+
782
+ if not result or not result.data:
783
+ # Check if queue exists at all (might be in different org)
784
+ check_result = (
785
+ client.table("worker_queues")
786
+ .select("id, organization_id")
787
+ .eq("id", queue_id)
788
+ .maybe_single()
789
+ .execute()
790
+ )
791
+
792
+ if check_result and check_result.data:
793
+ raise HTTPException(
794
+ status_code=status.HTTP_403_FORBIDDEN,
795
+ detail=f"Worker queue '{queue_id}' not found in your organization"
796
+ )
797
+ else:
798
+ raise HTTPException(
799
+ status_code=status.HTTP_404_NOT_FOUND,
800
+ detail=f"Worker queue '{queue_id}' does not exist. Please create a queue from the UI first."
801
+ )
802
+
803
+ queue = result.data
804
+
805
+ # Get environment/task_queue separately
806
+ if not queue.get("environment_id"):
807
+ raise HTTPException(
808
+ status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
809
+ detail=f"Worker queue '{queue.get('name', queue_id)}' has no environment configured. Please contact support."
810
+ )
811
+
812
+ env_result = (
813
+ client.table("environments")
814
+ .select("name")
815
+ .eq("id", queue["environment_id"])
816
+ .eq("organization_id", org_id)
817
+ .maybe_single()
818
+ .execute()
819
+ )
820
+
821
+ if not env_result or not env_result.data:
822
+ raise HTTPException(
823
+ status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
824
+ detail=f"Environment configuration error for queue '{queue.get('name', queue_id)}'. Please contact support."
825
+ )
826
+
827
+ environment_name = env_result.data["name"]
828
+
829
+ # Check if queue is active
830
+ if queue.get("status") != "active":
831
+ raise HTTPException(
832
+ status_code=status.HTTP_400_BAD_REQUEST,
833
+ detail=f"Worker queue is not active (status: {queue.get('status')})"
834
+ )
835
+
836
+ # TEMPORARY: Use fixed namespace + admin API key
837
+ import os
838
+ namespace = {
839
+ "namespace_name": "agent-control-plane.lpagu",
840
+ "api_key_encrypted": os.getenv("TEMPORAL_CLOUD_ADMIN_TOKEN", ""),
841
+ }
842
+
843
+ # Generate worker ID
844
+ worker_id = str(uuid.uuid4())
845
+
846
+ # Create worker heartbeat record
847
+ now = datetime.utcnow().isoformat()
848
+ worker_record = {
849
+ "id": worker_id,
850
+ "worker_id": worker_id,
851
+ "organization_id": org_id,
852
+ "worker_queue_id": queue_id,
853
+ "environment_name": environment_name,
854
+ "status": "active",
855
+ "tasks_processed": 0,
856
+ "registered_at": now,
857
+ "last_heartbeat": now,
858
+ "updated_at": now,
859
+ "worker_metadata": {},
860
+ }
861
+
862
+ client.table("worker_heartbeats").insert(worker_record).execute()
863
+
864
+ # Get control plane URL
865
+ control_plane_url = os.getenv("CONTROL_PLANE_URL", "https://agent-control-plane.vercel.app")
866
+ temporal_host = os.getenv("TEMPORAL_HOST", "us-east-1.aws.api.temporal.io:7233")
867
+
868
+ # Get LiteLLM configuration for agno workflows/activities
869
+ litellm_api_url = os.getenv("LITELLM_API_URL", "https://llm-proxy.kubiya.ai")
870
+ litellm_api_key = os.getenv("LITELLM_API_KEY", "")
871
+
872
+ # Task queue name is just the queue UUID for security
873
+ task_queue_name = queue_id
874
+
875
+ logger.info(
876
+ "worker_started_for_queue",
877
+ worker_id=worker_id,
878
+ queue_id=queue_id,
879
+ task_queue_name=task_queue_name,
880
+ org_id=org_id,
881
+ )
882
+
883
+ return WorkerStartResponse(
884
+ worker_id=worker_id,
885
+ task_queue_name=task_queue_name,
886
+ temporal_namespace=namespace["namespace_name"],
887
+ temporal_host=temporal_host,
888
+ temporal_api_key=namespace["api_key_encrypted"],
889
+ organization_id=org_id,
890
+ control_plane_url=control_plane_url,
891
+ heartbeat_interval=queue.get("heartbeat_interval", 30),
892
+ litellm_api_url=litellm_api_url,
893
+ litellm_api_key=litellm_api_key,
894
+ queue_name=queue["name"],
895
+ environment_name=environment_name,
896
+ )
897
+
898
+ except HTTPException:
899
+ raise
900
+ except Exception as e:
901
+ logger.error(
902
+ "worker_start_for_queue_failed",
903
+ error=str(e),
904
+ error_type=type(e).__name__,
905
+ queue_id=queue_id,
906
+ org_id=organization.get("id")
907
+ )
908
+ raise HTTPException(
909
+ status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
910
+ detail=f"Failed to start worker due to an internal error. Please try again or contact support. (Error ID: {queue_id[:8]})"
911
+ )
912
+
913
+
914
+ def _generate_local_script(worker_id: str, control_plane_url: str) -> str:
915
+ """Generate a bash script for local Python installation"""
916
+ return f"""#!/bin/bash
917
+ # Kubiya Agent Worker - Local Installation Script
918
+ # Generated: {datetime.utcnow().isoformat()}
919
+
920
+ set -e
921
+
922
+ echo "🚀 Setting up Kubiya Agent Worker..."
923
+ echo ""
924
+
925
+ # Configuration
926
+ WORKER_ID="{worker_id}"
927
+ CONTROL_PLANE_URL="{control_plane_url}"
928
+
929
+ # Check if KUBIYA_API_KEY is set
930
+ if [ -z "$KUBIYA_API_KEY" ]; then
931
+ echo "❌ Error: KUBIYA_API_KEY environment variable is not set"
932
+ echo "Please set it with: export KUBIYA_API_KEY=your-api-key"
933
+ exit 1
934
+ fi
935
+
936
+ # Check Python version
937
+ if ! command -v python3 &> /dev/null; then
938
+ echo "❌ Error: Python 3 is not installed"
939
+ exit 1
940
+ fi
941
+
942
+ PYTHON_VERSION=$(python3 --version | cut -d' ' -f2 | cut -d'.' -f1,2)
943
+ echo "✓ Found Python $PYTHON_VERSION"
944
+
945
+ # Create directory
946
+ WORKER_DIR="$HOME/.kubiya/workers/$WORKER_ID"
947
+ mkdir -p "$WORKER_DIR"
948
+ cd "$WORKER_DIR"
949
+
950
+ echo "✓ Created worker directory: $WORKER_DIR"
951
+
952
+ # Create virtual environment
953
+ echo "📦 Creating virtual environment..."
954
+ python3 -m venv venv
955
+ source venv/bin/activate
956
+
957
+ # Install dependencies
958
+ echo "📦 Installing dependencies..."
959
+ pip install --quiet --upgrade pip
960
+ pip install --quiet \\
961
+ temporalio>=1.5.0 \\
962
+ httpx>=0.27.0 \\
963
+ structlog>=24.1.0 \\
964
+ psutil>=5.9.0 \\
965
+ agno-sdk>=0.1.0 \\
966
+ litellm>=1.35.0
967
+
968
+ echo "✓ Dependencies installed"
969
+
970
+ # Download worker script
971
+ echo "📥 Downloading worker script..."
972
+ curl -s -o worker.py https://raw.githubusercontent.com/kubiya-sandbox/orchestrator/main/agent-worker/worker.py
973
+
974
+ echo "✓ Worker script downloaded"
975
+
976
+ # Create systemd service file (optional)
977
+ cat > kubiya-worker.service <<EOF
978
+ [Unit]
979
+ Description=Kubiya Agent Worker
980
+ After=network.target
981
+
982
+ [Service]
983
+ Type=simple
984
+ User=$USER
985
+ WorkingDirectory=$WORKER_DIR
986
+ Environment="WORKER_ID=$WORKER_ID"
987
+ Environment="KUBIYA_API_KEY=$KUBIYA_API_KEY"
988
+ Environment="CONTROL_PLANE_URL=$CONTROL_PLANE_URL"
989
+ ExecStart=$WORKER_DIR/venv/bin/python $WORKER_DIR/worker.py
990
+ Restart=always
991
+ RestartSec=10
992
+
993
+ [Install]
994
+ WantedBy=multi-user.target
995
+ EOF
996
+
997
+ echo "✓ Systemd service file created (optional)"
998
+
999
+ # Create run script
1000
+ cat > run.sh <<EOF
1001
+ #!/bin/bash
1002
+ cd "$WORKER_DIR"
1003
+ source venv/bin/activate
1004
+ export WORKER_ID="$WORKER_ID"
1005
+ export KUBIYA_API_KEY="$KUBIYA_API_KEY"
1006
+ export CONTROL_PLANE_URL="$CONTROL_PLANE_URL"
1007
+ python worker.py
1008
+ EOF
1009
+
1010
+ chmod +x run.sh
1011
+
1012
+ echo ""
1013
+ echo "✅ Installation complete!"
1014
+ echo ""
1015
+ echo "To start the worker:"
1016
+ echo " cd $WORKER_DIR && ./run.sh"
1017
+ echo ""
1018
+ echo "Or to install as a systemd service:"
1019
+ echo " sudo cp $WORKER_DIR/kubiya-worker.service /etc/systemd/system/"
1020
+ echo " sudo systemctl daemon-reload"
1021
+ echo " sudo systemctl enable kubiya-worker"
1022
+ echo " sudo systemctl start kubiya-worker"
1023
+ echo ""
1024
+ """
1025
+
1026
+
1027
def _generate_docker_script(worker_id: str, control_plane_url: str, queue_name: str, environment_name: str) -> str:
    """Generate Docker commands for running the worker.

    Args:
        worker_id: UUID baked into the container environment.
        control_plane_url: control-plane base URL for the worker to call.
        queue_name: human-readable queue name, used in container names.
        environment_name: environment label (informational in this template).

    Returns:
        A plain-text snippet containing a `docker run` command plus an
        embedded docker-compose.yml heredoc. Curly braces doubled as
        `${{...}}` render as `${...}` so compose-style interpolation
        survives the f-string.
    """
    # The entire behavior of this function is the literal template below.
    return f"""# Kubiya Agent Worker - Docker Installation
# Generated: {datetime.utcnow().isoformat()}

# Configuration
WORKER_ID="{worker_id}"
CONTROL_PLANE_URL="{control_plane_url}"
QUEUE_NAME="{queue_name}"
ENVIRONMENT_NAME="{environment_name}"

# Make sure to set your API key
# export KUBIYA_API_KEY=your-api-key

# Run with Docker
  docker run -d \\
    --name kubiya-worker-{queue_name}-{worker_id[:8]} \\
    --restart unless-stopped \\
    -e WORKER_ID="$WORKER_ID" \\
    -e KUBIYA_API_KEY="$KUBIYA_API_KEY" \\
    -e CONTROL_PLANE_URL="$CONTROL_PLANE_URL" \\
    -e LOG_LEVEL="INFO" \\
    kubiya/agent-worker:latest

# Check logs
# docker logs -f kubiya-worker-{queue_name}-{worker_id[:8]}

# Stop worker
# docker stop kubiya-worker-{queue_name}-{worker_id[:8]}

# Remove worker
# docker rm kubiya-worker-{queue_name}-{worker_id[:8]}

# Docker Compose (save as docker-compose.yml)
cat > docker-compose.yml <<EOF
version: '3.8'

services:
  worker:
    image: kubiya/agent-worker:latest
    container_name: kubiya-worker-{queue_name}
    restart: unless-stopped
    environment:
      - WORKER_ID={worker_id}
      - KUBIYA_API_KEY=${{KUBIYA_API_KEY}}
      - CONTROL_PLANE_URL={control_plane_url}
      - LOG_LEVEL=INFO
    healthcheck:
      test: ["CMD", "python", "-c", "import httpx; httpx.get('{control_plane_url}/health')"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 10s
EOF

# To use docker-compose:
# docker-compose up -d
"""
1085
+
1086
+
1087
def _generate_kubernetes_script(worker_id: str, control_plane_url: str, queue_name: str, environment_name: str) -> str:
    """Generate Kubernetes deployment YAML.

    Args:
        worker_id: UUID placed into the ConfigMap as WORKER_ID.
        control_plane_url: control-plane base URL placed into the ConfigMap.
        queue_name: queue name, used in resource names and labels.
        environment_name: environment label applied to all resources.

    Returns:
        Multi-document YAML (ConfigMap, Deployment, Service, plus a
        commented-out HorizontalPodAutoscaler) as plain text. The API key
        is pulled from a `kubiya-worker-secret` Secret, never inlined.
    """
    # The entire behavior of this function is the literal template below.
    return f"""# Kubiya Agent Worker - Kubernetes Deployment
# Generated: {datetime.utcnow().isoformat()}
#
# To deploy:
# 1. Create secret: kubectl create secret generic kubiya-worker-secret --from-literal=api-key=YOUR_API_KEY
# 2. Apply this file: kubectl apply -f kubiya-worker.yaml
#
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: kubiya-worker-{queue_name}-config
  labels:
    app: kubiya-worker
    queue: {queue_name}
    environment: {environment_name}
data:
  WORKER_ID: "{worker_id}"
  CONTROL_PLANE_URL: "{control_plane_url}"
  LOG_LEVEL: "INFO"

---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: kubiya-worker-{queue_name}
  labels:
    app: kubiya-worker
    queue: {queue_name}
    environment: {environment_name}
spec:
  replicas: 1
  selector:
    matchLabels:
      app: kubiya-worker
      queue: {queue_name}
  template:
    metadata:
      labels:
        app: kubiya-worker
        queue: {queue_name}
        environment: {environment_name}
    spec:
      containers:
      - name: worker
        image: kubiya/agent-worker:latest
        imagePullPolicy: Always
        envFrom:
        - configMapRef:
            name: kubiya-worker-{queue_name}-config
        env:
        - name: KUBIYA_API_KEY
          valueFrom:
            secretKeyRef:
              name: kubiya-worker-secret
              key: api-key
        resources:
          requests:
            memory: "512Mi"
            cpu: "250m"
          limits:
            memory: "2Gi"
            cpu: "1000m"
        livenessProbe:
          httpGet:
            path: /health
            port: 8080
          initialDelaySeconds: 30
          periodSeconds: 30
          timeoutSeconds: 10
          failureThreshold: 3
        readinessProbe:
          httpGet:
            path: /health
            port: 8080
          initialDelaySeconds: 10
          periodSeconds: 10
          timeoutSeconds: 5
          failureThreshold: 3
      restartPolicy: Always

---
apiVersion: v1
kind: Service
metadata:
  name: kubiya-worker-{queue_name}
  labels:
    app: kubiya-worker
    queue: {queue_name}
spec:
  selector:
    app: kubiya-worker
    queue: {queue_name}
  ports:
  - protocol: TCP
    port: 8080
    targetPort: 8080
  type: ClusterIP

---
# Optional: HorizontalPodAutoscaler
# apiVersion: autoscaling/v2
# kind: HorizontalPodAutoscaler
# metadata:
#   name: kubiya-worker-{queue_name}
# spec:
#   scaleTargetRef:
#     apiVersion: apps/v1
#     kind: Deployment
#     name: kubiya-worker-{queue_name}
#   minReplicas: 1
#   maxReplicas: 10
#   metrics:
#   - type: Resource
#     resource:
#       name: cpu
#       target:
#         type: Utilization
#         averageUtilization: 70
"""
1209
+
1210
+
1211
class WorkerQueueCommandResponse(BaseModel):
    """Worker queue connection command.

    Describes the CLI command a user should run to attach a worker to a
    queue, plus enough queue state to tell whether registration can succeed.
    """
    queue_id: str  # target worker queue UUID
    command: str  # full shell command, e.g. "kubiya worker start --queue-id <id>"
    command_parts: dict  # structured breakdown: binary / subcommand / flags
    can_register: bool  # True only when the queue status is "active"
    queue_status: str  # raw queue status value from the database
    active_workers: int  # count of workers currently heartbeating (from Redis)
    max_workers: Optional[int]  # queue's configured worker cap, if any
1220
+
1221
+
1222
class WorkerSystemInfo(BaseModel):
    """Worker system information.

    Self-reported host metrics included in a worker's heartbeat payload.
    All fields are optional because older workers may omit any of them.
    Memory/disk totals are presumably bytes and uptime seconds — values
    originate from the worker's heartbeat; confirm against the worker CLI.
    """
    # Host identity
    hostname: Optional[str] = None
    platform: Optional[str] = None
    os_name: Optional[str] = None
    os_version: Optional[str] = None
    # Runtime versions
    python_version: Optional[str] = None
    cli_version: Optional[str] = None
    # Docker availability on the host
    docker_available: Optional[bool] = None
    docker_version: Optional[str] = None
    # CPU metrics
    cpu_count: Optional[int] = None
    cpu_percent: Optional[float] = None
    # Memory metrics
    memory_total: Optional[int] = None
    memory_used: Optional[int] = None
    memory_percent: Optional[float] = None
    # Disk metrics
    disk_total: Optional[int] = None
    disk_used: Optional[int] = None
    disk_percent: Optional[float] = None
    # Time since the worker process started
    uptime_seconds: Optional[float] = None
1241
+
1242
+
1243
class WorkerDetail(BaseModel):
    """Individual worker details.

    Merged view of a worker: live heartbeat data from Redis plus the
    registration row from the worker_heartbeats table.
    """
    id: str  # Redis heartbeat key / primary worker identifier
    worker_id: str  # worker UUID from the database registration row
    status: str  # last reported status (e.g. "active"; "unknown" if missing)
    tasks_processed: int  # cumulative task counter reported by the worker
    current_task_id: Optional[str]  # task being executed right now, if any
    last_heartbeat: str  # ISO timestamp of the most recent heartbeat
    registered_at: str  # ISO timestamp of initial registration (from DB)
    system_info: Optional[WorkerSystemInfo] = None  # host metrics, if reported
    logs: Optional[List[str]] = None  # recent log lines from the heartbeat
    worker_metadata: dict  # free-form metadata attached by the worker
1255
+
1256
+
1257
+ @router.get("/worker-queues/{queue_id}/workers", response_model=List[WorkerDetail])
1258
+ async def list_queue_workers(
1259
+ queue_id: str,
1260
+ request: Request,
1261
+ organization: dict = Depends(get_current_organization),
1262
+ ):
1263
+ """
1264
+ List all workers for a specific queue with detailed information.
1265
+ """
1266
+ try:
1267
+ client = get_supabase()
1268
+ org_id = organization["id"]
1269
+
1270
+ # Get active workers from Redis for this queue
1271
+ active_workers = await get_active_workers_from_redis(org_id, queue_id)
1272
+
1273
+ # Get worker registration details from database (registered_at, worker_id)
1274
+ if active_workers:
1275
+ db_workers = (
1276
+ client.table("worker_heartbeats")
1277
+ .select("id, worker_id, registered_at")
1278
+ .eq("organization_id", org_id)
1279
+ .in_("id", list(active_workers.keys()))
1280
+ .execute()
1281
+ )
1282
+ db_workers_map = {w["id"]: w for w in (db_workers.data or [])}
1283
+ else:
1284
+ db_workers_map = {}
1285
+
1286
+ workers = []
1287
+ for worker_id, heartbeat_data in active_workers.items():
1288
+ # Get DB data for registration time
1289
+ db_data = db_workers_map.get(worker_id, {})
1290
+
1291
+ # Extract system info and logs from Redis heartbeat data
1292
+ metadata = heartbeat_data.get("metadata", {})
1293
+ system_info_data = heartbeat_data.get("system_info")
1294
+ logs = heartbeat_data.get("logs", [])
1295
+
1296
+ system_info = WorkerSystemInfo(**system_info_data) if system_info_data else None
1297
+
1298
+ workers.append(
1299
+ WorkerDetail(
1300
+ id=worker_id,
1301
+ worker_id=db_data.get("worker_id", worker_id),
1302
+ status=heartbeat_data.get("status", "unknown"),
1303
+ tasks_processed=heartbeat_data.get("tasks_processed", 0),
1304
+ current_task_id=heartbeat_data.get("current_task_id"),
1305
+ last_heartbeat=heartbeat_data.get("last_heartbeat", ""),
1306
+ registered_at=db_data.get("registered_at", ""),
1307
+ system_info=system_info,
1308
+ logs=logs,
1309
+ worker_metadata=metadata,
1310
+ )
1311
+ )
1312
+
1313
+ # Sort by last_heartbeat desc
1314
+ workers.sort(key=lambda w: w.last_heartbeat, reverse=True)
1315
+
1316
+ logger.info(
1317
+ "queue_workers_listed",
1318
+ queue_id=queue_id,
1319
+ worker_count=len(workers),
1320
+ org_id=org_id,
1321
+ )
1322
+
1323
+ return workers
1324
+
1325
+ except HTTPException:
1326
+ raise
1327
+ except Exception as e:
1328
+ logger.error("queue_workers_list_failed", error=str(e), queue_id=queue_id)
1329
+ raise HTTPException(
1330
+ status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
1331
+ detail=f"Failed to list queue workers: {str(e)}"
1332
+ )
1333
+
1334
+
1335
+ @router.get("/worker-queues/{queue_id}/worker-command", response_model=WorkerQueueCommandResponse)
1336
+ async def get_worker_queue_command(
1337
+ queue_id: str,
1338
+ request: Request,
1339
+ organization: dict = Depends(get_current_organization),
1340
+ ):
1341
+ """
1342
+ Get the worker registration command for a specific worker queue.
1343
+
1344
+ Returns the kubiya worker start command with the queue ID that users
1345
+ should run to start a worker for this specific queue.
1346
+ """
1347
+ try:
1348
+ client = get_supabase()
1349
+ org_id = organization["id"]
1350
+
1351
+ # Get worker queue
1352
+ result = (
1353
+ client.table("worker_queues")
1354
+ .select("*")
1355
+ .eq("id", queue_id)
1356
+ .eq("organization_id", org_id)
1357
+ .single()
1358
+ .execute()
1359
+ )
1360
+
1361
+ if not result.data:
1362
+ raise HTTPException(status_code=404, detail="Worker queue not found")
1363
+
1364
+ queue = result.data
1365
+ queue_status = queue.get("status", "unknown")
1366
+
1367
+ # Check if queue is active
1368
+ can_register = queue_status == "active"
1369
+
1370
+ # Get active workers from Redis for this specific queue
1371
+ active_workers_dict = await get_active_workers_from_redis(org_id, queue_id)
1372
+ active_workers = len(active_workers_dict)
1373
+
1374
+ # Build command
1375
+ command = f"kubiya worker start --queue-id {queue_id}"
1376
+
1377
+ command_parts = {
1378
+ "binary": "kubiya",
1379
+ "subcommand": "worker start",
1380
+ "flags": {
1381
+ "--queue-id": queue_id,
1382
+ },
1383
+ }
1384
+
1385
+ logger.info(
1386
+ "worker_queue_command_retrieved",
1387
+ queue_id=queue_id,
1388
+ can_register=can_register,
1389
+ status=queue_status,
1390
+ active_workers=active_workers,
1391
+ org_id=org_id,
1392
+ )
1393
+
1394
+ return WorkerQueueCommandResponse(
1395
+ queue_id=queue_id,
1396
+ command=command,
1397
+ command_parts=command_parts,
1398
+ can_register=can_register,
1399
+ queue_status=queue_status,
1400
+ active_workers=active_workers,
1401
+ max_workers=queue.get("max_workers"),
1402
+ )
1403
+
1404
+ except HTTPException:
1405
+ raise
1406
+ except Exception as e:
1407
+ logger.error("worker_queue_command_failed", error=str(e), queue_id=queue_id)
1408
+ raise HTTPException(
1409
+ status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
1410
+ detail=f"Failed to get worker queue command: {str(e)}"
1411
+ )
1412
+
1413
+
1414
def _generate_openshift_script(worker_id: str, control_plane_url: str, queue_name: str, environment_name: str) -> str:
    """Generate OpenShift deployment YAML.

    Args:
        worker_id: UUID placed into the ConfigMap as WORKER_ID.
        control_plane_url: control-plane base URL placed into the ConfigMap.
        queue_name: queue name, used in resource names and labels.
        environment_name: environment label applied to all resources.

    Returns:
        Multi-document YAML (ConfigMap, DeploymentConfig, Service, plus a
        commented-out Route) as plain text. Uses the OpenShift-specific
        apps.openshift.io/v1 DeploymentConfig with config/image triggers
        and a non-root security context; the API key comes from a
        `kubiya-worker-secret` Secret, never inlined.
    """
    # The entire behavior of this function is the literal template below.
    return f"""# Kubiya Agent Worker - OpenShift Deployment
# Generated: {datetime.utcnow().isoformat()}
#
# To deploy:
# 1. Create secret: oc create secret generic kubiya-worker-secret --from-literal=api-key=YOUR_API_KEY
# 2. Apply this file: oc apply -f kubiya-worker.yaml
#
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: kubiya-worker-{queue_name}-config
  labels:
    app: kubiya-worker
    queue: {queue_name}
    environment: {environment_name}
data:
  WORKER_ID: "{worker_id}"
  CONTROL_PLANE_URL: "{control_plane_url}"
  LOG_LEVEL: "INFO"

---
apiVersion: apps.openshift.io/v1
kind: DeploymentConfig
metadata:
  name: kubiya-worker-{queue_name}
  labels:
    app: kubiya-worker
    queue: {queue_name}
    environment: {environment_name}
spec:
  replicas: 1
  selector:
    app: kubiya-worker
    queue: {queue_name}
  template:
    metadata:
      labels:
        app: kubiya-worker
        queue: {queue_name}
        environment: {environment_name}
    spec:
      containers:
      - name: worker
        image: kubiya/agent-worker:latest
        imagePullPolicy: Always
        envFrom:
        - configMapRef:
            name: kubiya-worker-{queue_name}-config
        env:
        - name: KUBIYA_API_KEY
          valueFrom:
            secretKeyRef:
              name: kubiya-worker-secret
              key: api-key
        resources:
          requests:
            memory: "512Mi"
            cpu: "250m"
          limits:
            memory: "2Gi"
            cpu: "1000m"
        livenessProbe:
          httpGet:
            path: /health
            port: 8080
          initialDelaySeconds: 30
          periodSeconds: 30
          timeoutSeconds: 10
          failureThreshold: 3
        readinessProbe:
          httpGet:
            path: /health
            port: 8080
          initialDelaySeconds: 10
          periodSeconds: 10
          timeoutSeconds: 5
          failureThreshold: 3
      restartPolicy: Always
      securityContext:
        runAsNonRoot: true
        runAsUser: 1000
  triggers:
  - type: ConfigChange
  - type: ImageChange
    imageChangeParams:
      automatic: true
      containerNames:
      - worker
      from:
        kind: ImageStreamTag
        name: agent-worker:latest

---
apiVersion: v1
kind: Service
metadata:
  name: kubiya-worker-{queue_name}
  labels:
    app: kubiya-worker
    queue: {queue_name}
spec:
  selector:
    app: kubiya-worker
    queue: {queue_name}
  ports:
  - protocol: TCP
    port: 8080
    targetPort: 8080
  type: ClusterIP

---
# Optional: Route to expose the service
# apiVersion: route.openshift.io/v1
# kind: Route
# metadata:
#   name: kubiya-worker-{queue_name}
#   labels:
#     app: kubiya-worker
#     queue: {queue_name}
# spec:
#   to:
#     kind: Service
#     name: kubiya-worker-{queue_name}
#   port:
#     targetPort: 8080
#   tls:
#     termination: edge
#     insecureEdgeTerminationPolicy: Redirect
"""