kubiya-control-plane-api 0.1.0__py3-none-any.whl → 0.3.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of kubiya-control-plane-api might be problematic. Click here for more details.
- control_plane_api/README.md +266 -0
- control_plane_api/__init__.py +0 -0
- control_plane_api/__version__.py +1 -0
- control_plane_api/alembic/README +1 -0
- control_plane_api/alembic/env.py +98 -0
- control_plane_api/alembic/script.py.mako +28 -0
- control_plane_api/alembic/versions/1382bec74309_initial_migration_with_all_models.py +251 -0
- control_plane_api/alembic/versions/1f54bc2a37e3_add_analytics_tables.py +162 -0
- control_plane_api/alembic/versions/2e4cb136dc10_rename_toolset_ids_to_skill_ids_in_teams.py +30 -0
- control_plane_api/alembic/versions/31cd69a644ce_add_skill_templates_table.py +28 -0
- control_plane_api/alembic/versions/89e127caa47d_add_jobs_and_job_executions_tables.py +161 -0
- control_plane_api/alembic/versions/add_llm_models_table.py +51 -0
- control_plane_api/alembic/versions/b0e10697f212_add_runtime_column_to_teams_simple.py +42 -0
- control_plane_api/alembic/versions/ce43b24b63bf_add_execution_trigger_source_and_fix_.py +155 -0
- control_plane_api/alembic/versions/d4eaf16e3f8d_rename_toolsets_to_skills.py +84 -0
- control_plane_api/alembic/versions/efa2dc427da1_rename_metadata_to_custom_metadata.py +32 -0
- control_plane_api/alembic/versions/f973b431d1ce_add_workflow_executor_to_skill_types.py +44 -0
- control_plane_api/alembic.ini +148 -0
- control_plane_api/api/index.py +12 -0
- control_plane_api/app/__init__.py +11 -0
- control_plane_api/app/activities/__init__.py +20 -0
- control_plane_api/app/activities/agent_activities.py +379 -0
- control_plane_api/app/activities/team_activities.py +410 -0
- control_plane_api/app/activities/temporal_cloud_activities.py +577 -0
- control_plane_api/app/config/__init__.py +35 -0
- control_plane_api/app/config/api_config.py +354 -0
- control_plane_api/app/config/model_pricing.py +318 -0
- control_plane_api/app/config.py +95 -0
- control_plane_api/app/database.py +135 -0
- control_plane_api/app/exceptions.py +408 -0
- control_plane_api/app/lib/__init__.py +11 -0
- control_plane_api/app/lib/job_executor.py +312 -0
- control_plane_api/app/lib/kubiya_client.py +235 -0
- control_plane_api/app/lib/litellm_pricing.py +166 -0
- control_plane_api/app/lib/planning_tools/__init__.py +22 -0
- control_plane_api/app/lib/planning_tools/agents.py +155 -0
- control_plane_api/app/lib/planning_tools/base.py +189 -0
- control_plane_api/app/lib/planning_tools/environments.py +214 -0
- control_plane_api/app/lib/planning_tools/resources.py +240 -0
- control_plane_api/app/lib/planning_tools/teams.py +198 -0
- control_plane_api/app/lib/policy_enforcer_client.py +939 -0
- control_plane_api/app/lib/redis_client.py +436 -0
- control_plane_api/app/lib/supabase.py +71 -0
- control_plane_api/app/lib/temporal_client.py +138 -0
- control_plane_api/app/lib/validation/__init__.py +20 -0
- control_plane_api/app/lib/validation/runtime_validation.py +287 -0
- control_plane_api/app/main.py +128 -0
- control_plane_api/app/middleware/__init__.py +8 -0
- control_plane_api/app/middleware/auth.py +513 -0
- control_plane_api/app/middleware/exception_handler.py +267 -0
- control_plane_api/app/middleware/rate_limiting.py +384 -0
- control_plane_api/app/middleware/request_id.py +202 -0
- control_plane_api/app/models/__init__.py +27 -0
- control_plane_api/app/models/agent.py +79 -0
- control_plane_api/app/models/analytics.py +206 -0
- control_plane_api/app/models/associations.py +81 -0
- control_plane_api/app/models/environment.py +63 -0
- control_plane_api/app/models/execution.py +93 -0
- control_plane_api/app/models/job.py +179 -0
- control_plane_api/app/models/llm_model.py +75 -0
- control_plane_api/app/models/presence.py +49 -0
- control_plane_api/app/models/project.py +47 -0
- control_plane_api/app/models/session.py +38 -0
- control_plane_api/app/models/team.py +66 -0
- control_plane_api/app/models/workflow.py +55 -0
- control_plane_api/app/policies/README.md +121 -0
- control_plane_api/app/policies/approved_users.rego +62 -0
- control_plane_api/app/policies/business_hours.rego +51 -0
- control_plane_api/app/policies/rate_limiting.rego +100 -0
- control_plane_api/app/policies/tool_restrictions.rego +86 -0
- control_plane_api/app/routers/__init__.py +4 -0
- control_plane_api/app/routers/agents.py +364 -0
- control_plane_api/app/routers/agents_v2.py +1260 -0
- control_plane_api/app/routers/analytics.py +1014 -0
- control_plane_api/app/routers/context_manager.py +562 -0
- control_plane_api/app/routers/environment_context.py +270 -0
- control_plane_api/app/routers/environments.py +715 -0
- control_plane_api/app/routers/execution_environment.py +517 -0
- control_plane_api/app/routers/executions.py +1911 -0
- control_plane_api/app/routers/health.py +92 -0
- control_plane_api/app/routers/health_v2.py +326 -0
- control_plane_api/app/routers/integrations.py +274 -0
- control_plane_api/app/routers/jobs.py +1344 -0
- control_plane_api/app/routers/models.py +82 -0
- control_plane_api/app/routers/models_v2.py +361 -0
- control_plane_api/app/routers/policies.py +639 -0
- control_plane_api/app/routers/presence.py +234 -0
- control_plane_api/app/routers/projects.py +902 -0
- control_plane_api/app/routers/runners.py +379 -0
- control_plane_api/app/routers/runtimes.py +172 -0
- control_plane_api/app/routers/secrets.py +155 -0
- control_plane_api/app/routers/skills.py +1001 -0
- control_plane_api/app/routers/skills_definitions.py +140 -0
- control_plane_api/app/routers/task_planning.py +1256 -0
- control_plane_api/app/routers/task_queues.py +654 -0
- control_plane_api/app/routers/team_context.py +270 -0
- control_plane_api/app/routers/teams.py +1400 -0
- control_plane_api/app/routers/worker_queues.py +1545 -0
- control_plane_api/app/routers/workers.py +935 -0
- control_plane_api/app/routers/workflows.py +204 -0
- control_plane_api/app/runtimes/__init__.py +6 -0
- control_plane_api/app/runtimes/validation.py +344 -0
- control_plane_api/app/schemas/job_schemas.py +295 -0
- control_plane_api/app/services/__init__.py +1 -0
- control_plane_api/app/services/agno_service.py +619 -0
- control_plane_api/app/services/litellm_service.py +190 -0
- control_plane_api/app/services/policy_service.py +525 -0
- control_plane_api/app/services/temporal_cloud_provisioning.py +150 -0
- control_plane_api/app/skills/__init__.py +44 -0
- control_plane_api/app/skills/base.py +229 -0
- control_plane_api/app/skills/business_intelligence.py +189 -0
- control_plane_api/app/skills/data_visualization.py +154 -0
- control_plane_api/app/skills/docker.py +104 -0
- control_plane_api/app/skills/file_generation.py +94 -0
- control_plane_api/app/skills/file_system.py +110 -0
- control_plane_api/app/skills/python.py +92 -0
- control_plane_api/app/skills/registry.py +65 -0
- control_plane_api/app/skills/shell.py +102 -0
- control_plane_api/app/skills/workflow_executor.py +469 -0
- control_plane_api/app/utils/workflow_executor.py +354 -0
- control_plane_api/app/workflows/__init__.py +11 -0
- control_plane_api/app/workflows/agent_execution.py +507 -0
- control_plane_api/app/workflows/agent_execution_with_skills.py +222 -0
- control_plane_api/app/workflows/namespace_provisioning.py +326 -0
- control_plane_api/app/workflows/team_execution.py +399 -0
- control_plane_api/scripts/seed_models.py +239 -0
- control_plane_api/worker/__init__.py +0 -0
- control_plane_api/worker/activities/__init__.py +0 -0
- control_plane_api/worker/activities/agent_activities.py +1241 -0
- control_plane_api/worker/activities/approval_activities.py +234 -0
- control_plane_api/worker/activities/runtime_activities.py +388 -0
- control_plane_api/worker/activities/skill_activities.py +267 -0
- control_plane_api/worker/activities/team_activities.py +1217 -0
- control_plane_api/worker/config/__init__.py +31 -0
- control_plane_api/worker/config/worker_config.py +275 -0
- control_plane_api/worker/control_plane_client.py +529 -0
- control_plane_api/worker/examples/analytics_integration_example.py +362 -0
- control_plane_api/worker/models/__init__.py +1 -0
- control_plane_api/worker/models/inputs.py +89 -0
- control_plane_api/worker/runtimes/__init__.py +31 -0
- control_plane_api/worker/runtimes/base.py +789 -0
- control_plane_api/worker/runtimes/claude_code_runtime.py +1443 -0
- control_plane_api/worker/runtimes/default_runtime.py +617 -0
- control_plane_api/worker/runtimes/factory.py +173 -0
- control_plane_api/worker/runtimes/validation.py +93 -0
- control_plane_api/worker/services/__init__.py +1 -0
- control_plane_api/worker/services/agent_executor.py +422 -0
- control_plane_api/worker/services/agent_executor_v2.py +383 -0
- control_plane_api/worker/services/analytics_collector.py +457 -0
- control_plane_api/worker/services/analytics_service.py +464 -0
- control_plane_api/worker/services/approval_tools.py +310 -0
- control_plane_api/worker/services/approval_tools_agno.py +207 -0
- control_plane_api/worker/services/cancellation_manager.py +177 -0
- control_plane_api/worker/services/data_visualization.py +827 -0
- control_plane_api/worker/services/jira_tools.py +257 -0
- control_plane_api/worker/services/runtime_analytics.py +328 -0
- control_plane_api/worker/services/session_service.py +194 -0
- control_plane_api/worker/services/skill_factory.py +175 -0
- control_plane_api/worker/services/team_executor.py +574 -0
- control_plane_api/worker/services/team_executor_v2.py +465 -0
- control_plane_api/worker/services/workflow_executor_tools.py +1418 -0
- control_plane_api/worker/tests/__init__.py +1 -0
- control_plane_api/worker/tests/e2e/__init__.py +0 -0
- control_plane_api/worker/tests/e2e/test_execution_flow.py +571 -0
- control_plane_api/worker/tests/integration/__init__.py +0 -0
- control_plane_api/worker/tests/integration/test_control_plane_integration.py +308 -0
- control_plane_api/worker/tests/unit/__init__.py +0 -0
- control_plane_api/worker/tests/unit/test_control_plane_client.py +401 -0
- control_plane_api/worker/utils/__init__.py +1 -0
- control_plane_api/worker/utils/chunk_batcher.py +305 -0
- control_plane_api/worker/utils/retry_utils.py +60 -0
- control_plane_api/worker/utils/streaming_utils.py +373 -0
- control_plane_api/worker/worker.py +753 -0
- control_plane_api/worker/workflows/__init__.py +0 -0
- control_plane_api/worker/workflows/agent_execution.py +589 -0
- control_plane_api/worker/workflows/team_execution.py +429 -0
- kubiya_control_plane_api-0.3.4.dist-info/METADATA +229 -0
- kubiya_control_plane_api-0.3.4.dist-info/RECORD +182 -0
- kubiya_control_plane_api-0.3.4.dist-info/entry_points.txt +2 -0
- kubiya_control_plane_api-0.3.4.dist-info/top_level.txt +1 -0
- kubiya_control_plane_api-0.1.0.dist-info/METADATA +0 -66
- kubiya_control_plane_api-0.1.0.dist-info/RECORD +0 -5
- kubiya_control_plane_api-0.1.0.dist-info/top_level.txt +0 -1
- {kubiya_control_plane_api-0.1.0.dist-info/licenses → control_plane_api}/LICENSE +0 -0
- {kubiya_control_plane_api-0.1.0.dist-info → kubiya_control_plane_api-0.3.4.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,1545 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Worker Queues router - Manage worker queues within environments.
|
|
3
|
+
|
|
4
|
+
Each environment can have multiple worker queues for fine-grained worker management.
|
|
5
|
+
Task queue naming: {org_id}.{environment_name}.{worker_queue_name}
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from fastapi import APIRouter, Depends, HTTPException, status, Request
|
|
9
|
+
from fastapi.responses import PlainTextResponse
|
|
10
|
+
from typing import List, Optional, Literal
|
|
11
|
+
from datetime import datetime, timedelta
|
|
12
|
+
from pydantic import BaseModel, Field
|
|
13
|
+
import structlog
|
|
14
|
+
import uuid
|
|
15
|
+
import os
|
|
16
|
+
import json
|
|
17
|
+
|
|
18
|
+
from control_plane_api.app.middleware.auth import get_current_organization
|
|
19
|
+
from control_plane_api.app.lib.supabase import get_supabase
|
|
20
|
+
from control_plane_api.app.lib.redis_client import get_redis_client
|
|
21
|
+
|
|
22
|
+
logger = structlog.get_logger()
|
|
23
|
+
|
|
24
|
+
router = APIRouter()
|
|
25
|
+
|
|
26
|
+
# Stale worker threshold: 60 seconds (2x the default heartbeat interval of 30s)
|
|
27
|
+
STALE_WORKER_THRESHOLD_SECONDS = 60
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
async def get_active_workers_from_redis(org_id: str, queue_id: Optional[str] = None) -> dict:
    """
    Collect the currently-alive workers for an organization from Redis heartbeats.

    Heartbeat keys carry an automatic Redis TTL (5 minutes), so a worker that
    stops reporting simply disappears from Redis — no manual stale-marking is
    needed. A worker is considered active only when its heartbeat payload is
    younger than STALE_WORKER_THRESHOLD_SECONDS.

    Args:
        org_id: Organization ID
        queue_id: Optional queue ID to filter by

    Returns:
        Dict with worker_id -> heartbeat_data mapping (empty on any failure)
    """
    redis_conn = get_redis_client()
    if not redis_conn:
        logger.warning("redis_unavailable_for_worker_query", org_id=org_id)
        return {}

    try:
        # Worker rows come from the DB so worker_id can be mapped to a queue_id.
        db = get_supabase()
        rows = (
            db.table("worker_heartbeats")
            .select("id, worker_queue_id")
            .eq("organization_id", org_id)
            .execute()
        )

        alive: dict = {}

        for row in rows.data or []:
            wid = row["id"]
            wq_id = row.get("worker_queue_id")

            # Honour the optional per-queue filter.
            if queue_id and wq_id != queue_id:
                continue

            # The heartbeat key only exists while its Redis TTL is alive.
            raw = await redis_conn.get(f"worker:{wid}:heartbeat")
            if not raw:
                continue

            try:
                payload = json.loads(raw)
                beat_at = datetime.fromisoformat(payload.get("last_heartbeat", ""))
                age = (datetime.utcnow() - beat_at).total_seconds()
                if age <= STALE_WORKER_THRESHOLD_SECONDS:
                    alive[wid] = {**payload, "worker_queue_id": wq_id}
            except (json.JSONDecodeError, ValueError) as e:
                # Malformed JSON or an unparsable timestamp — skip this worker.
                logger.warning("invalid_heartbeat_data", worker_id=wid, error=str(e))
                continue

        return alive

    except Exception as e:
        logger.error("failed_to_get_active_workers_from_redis", error=str(e), org_id=org_id)
        return {}
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
# Pydantic schemas
|
|
99
|
+
class WorkerQueueCreate(BaseModel):
    """Payload for creating a worker queue inside an environment."""

    # The description promises "lowercase, no spaces" but nothing enforced it;
    # the pattern rejects invalid names at validation time instead of storing them.
    name: str = Field(
        ...,
        min_length=2,
        max_length=50,
        pattern=r"^[a-z0-9][a-z0-9_-]*$",
        description="Worker queue name (lowercase, no spaces)",
    )
    display_name: Optional[str] = Field(None, description="User-friendly display name")
    description: Optional[str] = Field(None, description="Queue description")
    max_workers: Optional[int] = Field(None, ge=1, description="Max workers allowed (NULL = unlimited)")
    heartbeat_interval: int = Field(30, ge=10, le=300, description="Seconds between heartbeats")
    # Free-form labels for grouping/filtering queues.
    tags: List[str] = Field(default_factory=list)
    # Arbitrary queue-level configuration blob.
    settings: dict = Field(default_factory=dict)
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
class WorkerQueueUpdate(BaseModel):
    """Partial-update payload for a worker queue; unset fields are left unchanged."""

    name: Optional[str] = Field(None, min_length=2, max_length=50)
    display_name: Optional[str] = None
    description: Optional[str] = None
    # NOTE(review): status is a free-form string at this layer; valid values are
    # not validated here — confirm against the DB constraint before relying on it.
    status: Optional[str] = None
    max_workers: Optional[int] = Field(None, ge=1)
    heartbeat_interval: Optional[int] = Field(None, ge=10, le=300)
    tags: Optional[List[str]] = None
    settings: Optional[dict] = None
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
class WorkerQueueResponse(BaseModel):
    """API representation of a worker queue, including computed liveness fields."""

    id: str
    organization_id: str
    environment_id: str
    name: str
    display_name: Optional[str]
    description: Optional[str]
    status: str
    max_workers: Optional[int]
    heartbeat_interval: int
    tags: List[str]
    settings: dict
    created_at: str
    updated_at: str
    created_by: Optional[str]
    # Computed
    active_workers: int = 0
    task_queue_name: str  # Full task queue name (the queue UUID, used as-is for unpredictability)
    # Name of the owning environment. Endpoints already pass this kwarg, but the
    # field was missing from the model, so Pydantic silently dropped it from
    # every response; declaring it (optional, default None) makes it visible.
    environment_name: Optional[str] = None
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
@router.get("/worker-queues", response_model=List[WorkerQueueResponse])
|
|
141
|
+
async def list_all_worker_queues(
|
|
142
|
+
request: Request,
|
|
143
|
+
organization: dict = Depends(get_current_organization),
|
|
144
|
+
):
|
|
145
|
+
"""List all worker queues across all environments for the organization"""
|
|
146
|
+
try:
|
|
147
|
+
client = get_supabase()
|
|
148
|
+
org_id = organization["id"]
|
|
149
|
+
|
|
150
|
+
# Get all worker queues for this organization
|
|
151
|
+
result = (
|
|
152
|
+
client.table("worker_queues")
|
|
153
|
+
.select("*, environments(name)")
|
|
154
|
+
.eq("organization_id", org_id)
|
|
155
|
+
.order("created_at", desc=False)
|
|
156
|
+
.execute()
|
|
157
|
+
)
|
|
158
|
+
|
|
159
|
+
if not result.data:
|
|
160
|
+
return []
|
|
161
|
+
|
|
162
|
+
# Get active workers from Redis (with automatic TTL-based expiration)
|
|
163
|
+
active_workers = await get_active_workers_from_redis(org_id)
|
|
164
|
+
|
|
165
|
+
# Count workers per queue
|
|
166
|
+
worker_counts = {}
|
|
167
|
+
for worker_id, worker_data in active_workers.items():
|
|
168
|
+
queue_id = worker_data.get("worker_queue_id")
|
|
169
|
+
if queue_id:
|
|
170
|
+
worker_counts[queue_id] = worker_counts.get(queue_id, 0) + 1
|
|
171
|
+
|
|
172
|
+
# Build response
|
|
173
|
+
queues = []
|
|
174
|
+
for queue in result.data:
|
|
175
|
+
# Use queue UUID as task queue name for security
|
|
176
|
+
task_queue_name = queue["id"]
|
|
177
|
+
active_worker_count = worker_counts.get(queue["id"], 0)
|
|
178
|
+
|
|
179
|
+
# Get environment name from join
|
|
180
|
+
env_data = queue.get("environments")
|
|
181
|
+
environment_name = env_data.get("name") if env_data else None
|
|
182
|
+
|
|
183
|
+
queue_copy = dict(queue)
|
|
184
|
+
queue_copy.pop("environments", None) # Remove join data
|
|
185
|
+
|
|
186
|
+
queues.append(
|
|
187
|
+
WorkerQueueResponse(
|
|
188
|
+
**queue_copy,
|
|
189
|
+
active_workers=active_worker_count,
|
|
190
|
+
task_queue_name=task_queue_name,
|
|
191
|
+
environment_name=environment_name,
|
|
192
|
+
)
|
|
193
|
+
)
|
|
194
|
+
|
|
195
|
+
logger.info(
|
|
196
|
+
"all_worker_queues_listed",
|
|
197
|
+
count=len(queues),
|
|
198
|
+
org_id=org_id,
|
|
199
|
+
)
|
|
200
|
+
|
|
201
|
+
return queues
|
|
202
|
+
|
|
203
|
+
except HTTPException:
|
|
204
|
+
raise
|
|
205
|
+
except Exception as e:
|
|
206
|
+
logger.error("all_worker_queues_list_failed", error=str(e), org_id=org_id)
|
|
207
|
+
raise HTTPException(
|
|
208
|
+
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
|
209
|
+
detail=f"Failed to list all worker queues: {str(e)}"
|
|
210
|
+
)
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
@router.post("/environments/{environment_id}/worker-queues", response_model=WorkerQueueResponse, status_code=status.HTTP_201_CREATED)
|
|
214
|
+
async def create_worker_queue(
|
|
215
|
+
environment_id: str,
|
|
216
|
+
queue_data: WorkerQueueCreate,
|
|
217
|
+
request: Request,
|
|
218
|
+
organization: dict = Depends(get_current_organization),
|
|
219
|
+
):
|
|
220
|
+
"""Create a new worker queue within an environment"""
|
|
221
|
+
try:
|
|
222
|
+
client = get_supabase()
|
|
223
|
+
org_id = organization["id"]
|
|
224
|
+
|
|
225
|
+
# Validate environment exists
|
|
226
|
+
env_result = (
|
|
227
|
+
client.table("environments")
|
|
228
|
+
.select("id, name")
|
|
229
|
+
.eq("id", environment_id)
|
|
230
|
+
.eq("organization_id", org_id)
|
|
231
|
+
.single()
|
|
232
|
+
.execute()
|
|
233
|
+
)
|
|
234
|
+
|
|
235
|
+
if not env_result.data:
|
|
236
|
+
raise HTTPException(
|
|
237
|
+
status_code=status.HTTP_404_NOT_FOUND,
|
|
238
|
+
detail="Environment not found"
|
|
239
|
+
)
|
|
240
|
+
|
|
241
|
+
environment = env_result.data
|
|
242
|
+
|
|
243
|
+
# Check if worker queue name already exists in this environment
|
|
244
|
+
existing = (
|
|
245
|
+
client.table("worker_queues")
|
|
246
|
+
.select("id")
|
|
247
|
+
.eq("environment_id", environment_id)
|
|
248
|
+
.eq("name", queue_data.name)
|
|
249
|
+
.execute()
|
|
250
|
+
)
|
|
251
|
+
|
|
252
|
+
if existing.data:
|
|
253
|
+
raise HTTPException(
|
|
254
|
+
status_code=status.HTTP_409_CONFLICT,
|
|
255
|
+
detail=f"Worker queue '{queue_data.name}' already exists in this environment"
|
|
256
|
+
)
|
|
257
|
+
|
|
258
|
+
# Create worker queue
|
|
259
|
+
queue_id = str(uuid.uuid4())
|
|
260
|
+
now = datetime.utcnow().isoformat()
|
|
261
|
+
|
|
262
|
+
queue_record = {
|
|
263
|
+
"id": queue_id,
|
|
264
|
+
"organization_id": org_id,
|
|
265
|
+
"environment_id": environment_id,
|
|
266
|
+
"name": queue_data.name,
|
|
267
|
+
"display_name": queue_data.display_name or queue_data.name,
|
|
268
|
+
"description": queue_data.description,
|
|
269
|
+
"status": "active",
|
|
270
|
+
"max_workers": queue_data.max_workers,
|
|
271
|
+
"heartbeat_interval": queue_data.heartbeat_interval,
|
|
272
|
+
"tags": queue_data.tags,
|
|
273
|
+
"settings": queue_data.settings,
|
|
274
|
+
"created_at": now,
|
|
275
|
+
"updated_at": now,
|
|
276
|
+
"created_by": organization.get("user_id"),
|
|
277
|
+
}
|
|
278
|
+
|
|
279
|
+
result = client.table("worker_queues").insert(queue_record).execute()
|
|
280
|
+
|
|
281
|
+
if not result.data:
|
|
282
|
+
raise HTTPException(
|
|
283
|
+
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
|
284
|
+
detail="Failed to create worker queue"
|
|
285
|
+
)
|
|
286
|
+
|
|
287
|
+
queue = result.data[0]
|
|
288
|
+
|
|
289
|
+
# Use queue UUID as task queue name for security (unpredictable)
|
|
290
|
+
task_queue_name = queue_id
|
|
291
|
+
|
|
292
|
+
logger.info(
|
|
293
|
+
"worker_queue_created",
|
|
294
|
+
queue_id=queue_id,
|
|
295
|
+
queue_name=queue["name"],
|
|
296
|
+
environment_id=environment_id,
|
|
297
|
+
task_queue_name=task_queue_name,
|
|
298
|
+
org_id=org_id,
|
|
299
|
+
)
|
|
300
|
+
|
|
301
|
+
return WorkerQueueResponse(
|
|
302
|
+
**queue,
|
|
303
|
+
active_workers=0,
|
|
304
|
+
task_queue_name=task_queue_name,
|
|
305
|
+
)
|
|
306
|
+
|
|
307
|
+
except HTTPException:
|
|
308
|
+
raise
|
|
309
|
+
except Exception as e:
|
|
310
|
+
logger.error("worker_queue_creation_failed", error=str(e), org_id=organization["id"])
|
|
311
|
+
raise HTTPException(
|
|
312
|
+
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
|
313
|
+
detail=f"Failed to create worker queue: {str(e)}"
|
|
314
|
+
)
|
|
315
|
+
|
|
316
|
+
|
|
317
|
+
@router.get("/environments/{environment_id}/worker-queues", response_model=List[WorkerQueueResponse])
|
|
318
|
+
async def list_worker_queues(
|
|
319
|
+
environment_id: str,
|
|
320
|
+
request: Request,
|
|
321
|
+
organization: dict = Depends(get_current_organization),
|
|
322
|
+
):
|
|
323
|
+
"""List all worker queues in an environment"""
|
|
324
|
+
try:
|
|
325
|
+
client = get_supabase()
|
|
326
|
+
org_id = organization["id"]
|
|
327
|
+
|
|
328
|
+
# Get environment name
|
|
329
|
+
env_result = (
|
|
330
|
+
client.table("environments")
|
|
331
|
+
.select("name")
|
|
332
|
+
.eq("id", environment_id)
|
|
333
|
+
.eq("organization_id", org_id)
|
|
334
|
+
.single()
|
|
335
|
+
.execute()
|
|
336
|
+
)
|
|
337
|
+
|
|
338
|
+
if not env_result.data:
|
|
339
|
+
raise HTTPException(
|
|
340
|
+
status_code=status.HTTP_404_NOT_FOUND,
|
|
341
|
+
detail="Environment not found"
|
|
342
|
+
)
|
|
343
|
+
|
|
344
|
+
environment_name = env_result.data["name"]
|
|
345
|
+
|
|
346
|
+
# Get worker queues
|
|
347
|
+
result = (
|
|
348
|
+
client.table("worker_queues")
|
|
349
|
+
.select("*")
|
|
350
|
+
.eq("environment_id", environment_id)
|
|
351
|
+
.order("created_at", desc=False)
|
|
352
|
+
.execute()
|
|
353
|
+
)
|
|
354
|
+
|
|
355
|
+
if not result.data:
|
|
356
|
+
return []
|
|
357
|
+
|
|
358
|
+
# Get active workers from Redis (with automatic TTL-based expiration)
|
|
359
|
+
active_workers = await get_active_workers_from_redis(org_id)
|
|
360
|
+
|
|
361
|
+
# Count workers per queue
|
|
362
|
+
worker_counts = {}
|
|
363
|
+
for worker_id, worker_data in active_workers.items():
|
|
364
|
+
queue_id = worker_data.get("worker_queue_id")
|
|
365
|
+
if queue_id:
|
|
366
|
+
worker_counts[queue_id] = worker_counts.get(queue_id, 0) + 1
|
|
367
|
+
|
|
368
|
+
# Build response
|
|
369
|
+
queues = []
|
|
370
|
+
for queue in result.data:
|
|
371
|
+
# Use queue UUID as task queue name for security
|
|
372
|
+
task_queue_name = queue["id"]
|
|
373
|
+
active_workers = worker_counts.get(queue["id"], 0)
|
|
374
|
+
|
|
375
|
+
queues.append(
|
|
376
|
+
WorkerQueueResponse(
|
|
377
|
+
**queue,
|
|
378
|
+
active_workers=active_workers,
|
|
379
|
+
task_queue_name=task_queue_name,
|
|
380
|
+
)
|
|
381
|
+
)
|
|
382
|
+
|
|
383
|
+
logger.info(
|
|
384
|
+
"worker_queues_listed",
|
|
385
|
+
count=len(queues),
|
|
386
|
+
environment_id=environment_id,
|
|
387
|
+
org_id=org_id,
|
|
388
|
+
)
|
|
389
|
+
|
|
390
|
+
return queues
|
|
391
|
+
|
|
392
|
+
except HTTPException:
|
|
393
|
+
raise
|
|
394
|
+
except Exception as e:
|
|
395
|
+
logger.error("worker_queues_list_failed", error=str(e), environment_id=environment_id)
|
|
396
|
+
raise HTTPException(
|
|
397
|
+
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
|
398
|
+
detail=f"Failed to list worker queues: {str(e)}"
|
|
399
|
+
)
|
|
400
|
+
|
|
401
|
+
|
|
402
|
+
@router.get("/worker-queues/{queue_id}", response_model=WorkerQueueResponse)
|
|
403
|
+
async def get_worker_queue(
|
|
404
|
+
queue_id: str,
|
|
405
|
+
request: Request,
|
|
406
|
+
organization: dict = Depends(get_current_organization),
|
|
407
|
+
):
|
|
408
|
+
"""Get a specific worker queue by ID"""
|
|
409
|
+
try:
|
|
410
|
+
client = get_supabase()
|
|
411
|
+
org_id = organization["id"]
|
|
412
|
+
|
|
413
|
+
# Get worker queue
|
|
414
|
+
result = (
|
|
415
|
+
client.table("worker_queues")
|
|
416
|
+
.select("*")
|
|
417
|
+
.eq("id", queue_id)
|
|
418
|
+
.eq("organization_id", org_id)
|
|
419
|
+
.single()
|
|
420
|
+
.execute()
|
|
421
|
+
)
|
|
422
|
+
|
|
423
|
+
if not result.data:
|
|
424
|
+
raise HTTPException(
|
|
425
|
+
status_code=status.HTTP_404_NOT_FOUND,
|
|
426
|
+
detail="Worker queue not found"
|
|
427
|
+
)
|
|
428
|
+
|
|
429
|
+
queue = result.data
|
|
430
|
+
|
|
431
|
+
# Get environment name separately
|
|
432
|
+
environment_name = "unknown"
|
|
433
|
+
if queue.get("environment_id"):
|
|
434
|
+
env_result = (
|
|
435
|
+
client.table("environments")
|
|
436
|
+
.select("name")
|
|
437
|
+
.eq("id", queue["environment_id"])
|
|
438
|
+
.eq("organization_id", org_id)
|
|
439
|
+
.maybe_single()
|
|
440
|
+
.execute()
|
|
441
|
+
)
|
|
442
|
+
if env_result.data:
|
|
443
|
+
environment_name = env_result.data["name"]
|
|
444
|
+
|
|
445
|
+
# Get active workers from Redis for this specific queue
|
|
446
|
+
active_workers_dict = await get_active_workers_from_redis(org_id, queue_id)
|
|
447
|
+
active_workers = len(active_workers_dict)
|
|
448
|
+
|
|
449
|
+
# Remove joined data
|
|
450
|
+
queue.pop("environments", None)
|
|
451
|
+
|
|
452
|
+
# Use queue UUID as task queue name for security
|
|
453
|
+
task_queue_name = queue_id
|
|
454
|
+
|
|
455
|
+
return WorkerQueueResponse(
|
|
456
|
+
**queue,
|
|
457
|
+
active_workers=active_workers,
|
|
458
|
+
task_queue_name=task_queue_name,
|
|
459
|
+
)
|
|
460
|
+
|
|
461
|
+
except HTTPException:
|
|
462
|
+
raise
|
|
463
|
+
except Exception as e:
|
|
464
|
+
logger.error("worker_queue_get_failed", error=str(e), queue_id=queue_id)
|
|
465
|
+
raise HTTPException(
|
|
466
|
+
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
|
467
|
+
detail=f"Failed to get worker queue: {str(e)}"
|
|
468
|
+
)
|
|
469
|
+
|
|
470
|
+
|
|
471
|
+
@router.patch("/worker-queues/{queue_id}", response_model=WorkerQueueResponse)
|
|
472
|
+
async def update_worker_queue(
|
|
473
|
+
queue_id: str,
|
|
474
|
+
queue_data: WorkerQueueUpdate,
|
|
475
|
+
request: Request,
|
|
476
|
+
organization: dict = Depends(get_current_organization),
|
|
477
|
+
):
|
|
478
|
+
"""Update a worker queue"""
|
|
479
|
+
try:
|
|
480
|
+
client = get_supabase()
|
|
481
|
+
org_id = organization["id"]
|
|
482
|
+
|
|
483
|
+
# Check if queue exists
|
|
484
|
+
existing = (
|
|
485
|
+
client.table("worker_queues")
|
|
486
|
+
.select("id, environment_id")
|
|
487
|
+
.eq("id", queue_id)
|
|
488
|
+
.eq("organization_id", org_id)
|
|
489
|
+
.single()
|
|
490
|
+
.execute()
|
|
491
|
+
)
|
|
492
|
+
|
|
493
|
+
if not existing.data:
|
|
494
|
+
raise HTTPException(
|
|
495
|
+
status_code=status.HTTP_404_NOT_FOUND,
|
|
496
|
+
detail="Worker queue not found"
|
|
497
|
+
)
|
|
498
|
+
|
|
499
|
+
# Build update dict
|
|
500
|
+
update_data = queue_data.model_dump(exclude_unset=True)
|
|
501
|
+
update_data["updated_at"] = datetime.utcnow().isoformat()
|
|
502
|
+
|
|
503
|
+
# Update queue
|
|
504
|
+
result = (
|
|
505
|
+
client.table("worker_queues")
|
|
506
|
+
.update(update_data)
|
|
507
|
+
.eq("id", queue_id)
|
|
508
|
+
.eq("organization_id", org_id)
|
|
509
|
+
.execute()
|
|
510
|
+
)
|
|
511
|
+
|
|
512
|
+
if not result.data:
|
|
513
|
+
raise HTTPException(
|
|
514
|
+
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
|
515
|
+
detail="Failed to update worker queue"
|
|
516
|
+
)
|
|
517
|
+
|
|
518
|
+
queue = result.data[0]
|
|
519
|
+
|
|
520
|
+
# Get environment name and active workers
|
|
521
|
+
env_result = (
|
|
522
|
+
client.table("environments")
|
|
523
|
+
.select("name")
|
|
524
|
+
.eq("id", queue["environment_id"])
|
|
525
|
+
.single()
|
|
526
|
+
.execute()
|
|
527
|
+
)
|
|
528
|
+
|
|
529
|
+
environment_name = env_result.data["name"] if env_result.data else "unknown"
|
|
530
|
+
|
|
531
|
+
workers_result = (
|
|
532
|
+
client.table("worker_heartbeats")
|
|
533
|
+
.select("id")
|
|
534
|
+
.eq("worker_queue_id", queue_id)
|
|
535
|
+
.in_("status", ["active", "idle", "busy"])
|
|
536
|
+
.execute()
|
|
537
|
+
)
|
|
538
|
+
|
|
539
|
+
active_workers = len(workers_result.data or [])
|
|
540
|
+
# Use queue UUID as task queue name for security
|
|
541
|
+
task_queue_name = queue_id
|
|
542
|
+
|
|
543
|
+
logger.info(
|
|
544
|
+
"worker_queue_updated",
|
|
545
|
+
queue_id=queue_id,
|
|
546
|
+
org_id=org_id,
|
|
547
|
+
)
|
|
548
|
+
|
|
549
|
+
return WorkerQueueResponse(
|
|
550
|
+
**queue,
|
|
551
|
+
active_workers=active_workers,
|
|
552
|
+
task_queue_name=task_queue_name,
|
|
553
|
+
)
|
|
554
|
+
|
|
555
|
+
except HTTPException:
|
|
556
|
+
raise
|
|
557
|
+
except Exception as e:
|
|
558
|
+
logger.error("worker_queue_update_failed", error=str(e), queue_id=queue_id)
|
|
559
|
+
raise HTTPException(
|
|
560
|
+
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
|
561
|
+
detail=f"Failed to update worker queue: {str(e)}"
|
|
562
|
+
)
|
|
563
|
+
|
|
564
|
+
|
|
565
|
+
@router.delete("/worker-queues/{queue_id}", status_code=status.HTTP_204_NO_CONTENT)
async def delete_worker_queue(
    queue_id: str,
    request: Request,
    organization: dict = Depends(get_current_organization),
):
    """Delete a worker queue.

    Refuses to delete the organization's "default" queue, and refuses to
    delete any queue that still has active workers attached (as reported
    by Redis heartbeats).

    Raises:
        HTTPException: 400 if the queue is the default one or still has
            active workers, 404 if it does not exist in this organization,
            500 on unexpected errors.
    """
    try:
        client = get_supabase()
        org_id = organization["id"]

        # Prevent deleting the default queue. Use maybe_single() rather than
        # single(): single() raises on a missing row, which the generic
        # handler below would turn into a 500 — maybe_single() returns empty
        # data instead, and the delete further down still yields a clean 404
        # for a nonexistent queue.
        queue_check = (
            client.table("worker_queues")
            .select("name")
            .eq("id", queue_id)
            .eq("organization_id", org_id)
            .maybe_single()
            .execute()
        )

        if queue_check and queue_check.data and queue_check.data.get("name") == "default":
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail="Cannot delete the default worker queue"
            )

        # Check for active workers in Redis
        active_workers = await get_active_workers_from_redis(org_id, queue_id)

        if active_workers:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail=f"Cannot delete worker queue with {len(active_workers)} active workers"
            )

        # Delete queue (scoped to the caller's organization)
        result = (
            client.table("worker_queues")
            .delete()
            .eq("id", queue_id)
            .eq("organization_id", org_id)
            .execute()
        )

        # Empty result means no row matched: queue does not exist in this org
        if not result.data:
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail="Worker queue not found"
            )

        logger.info("worker_queue_deleted", queue_id=queue_id, org_id=org_id)

        return None

    except HTTPException:
        raise
    except Exception as e:
        logger.error("worker_queue_delete_failed", error=str(e), queue_id=queue_id)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to delete worker queue: {str(e)}"
        )
|
|
628
|
+
|
|
629
|
+
|
|
630
|
+
@router.get("/worker-queues/{queue_id}/install-script")
async def get_installation_script(
    queue_id: str,
    deployment_type: Literal["docker", "kubernetes", "openshift", "local"] = "local",
    request: Request = None,
    organization: dict = Depends(get_current_organization),
):
    """
    Generate an installation script for setting up a worker for this queue.

    Supports multiple deployment types:
    - local: Python virtual environment setup
    - docker: Docker run command
    - kubernetes: Kubernetes deployment YAML
    - openshift: OpenShift deployment YAML

    Returns:
        PlainTextResponse containing the generated script.

    Raises:
        HTTPException: 404 if the queue does not exist in this organization,
            400 for an unsupported deployment type, 500 on unexpected errors.
    """
    try:
        client = get_supabase()
        org_id = organization["id"]

        # Get worker queue details. maybe_single() returns empty data for a
        # missing row instead of raising, so a nonexistent queue reaches the
        # explicit 404 below rather than being caught as a generic 500.
        result = (
            client.table("worker_queues")
            .select("*")
            .eq("id", queue_id)
            .eq("organization_id", org_id)
            .maybe_single()
            .execute()
        )

        if not result or not result.data:
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail="Worker queue not found"
            )

        queue = result.data

        # Get environment name separately; fall back to "default" when the
        # queue has no environment or the environment row is missing.
        environment_name = "default"
        if queue.get("environment_id"):
            env_result = (
                client.table("environments")
                .select("name")
                .eq("id", queue["environment_id"])
                .eq("organization_id", org_id)
                .maybe_single()
                .execute()
            )
            if env_result and env_result.data:
                environment_name = env_result.data["name"]
        queue_name = queue["name"]

        # Get control plane URL
        control_plane_url = os.getenv("CONTROL_PLANE_URL", "https://agent-control-plane.vercel.app")

        # Generate new worker ID
        worker_id = str(uuid.uuid4())

        # Generate script based on deployment type
        if deployment_type == "local":
            script = _generate_local_script(worker_id, control_plane_url)
        elif deployment_type == "docker":
            script = _generate_docker_script(worker_id, control_plane_url, queue_name, environment_name)
        elif deployment_type == "kubernetes":
            script = _generate_kubernetes_script(worker_id, control_plane_url, queue_name, environment_name)
        elif deployment_type == "openshift":
            script = _generate_openshift_script(worker_id, control_plane_url, queue_name, environment_name)
        else:
            # Unreachable through FastAPI's Literal validation; kept as a
            # defensive guard for direct (non-HTTP) callers.
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail=f"Unsupported deployment type: {deployment_type}"
            )

        logger.info(
            "installation_script_generated",
            queue_id=queue_id,
            deployment_type=deployment_type,
            worker_id=worker_id,
            org_id=org_id,
        )

        return PlainTextResponse(content=script, media_type="text/plain")

    except HTTPException:
        raise
    except Exception as e:
        logger.error("installation_script_generation_failed", error=str(e), queue_id=queue_id)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to generate installation script: {str(e)}"
        )
|
|
722
|
+
|
|
723
|
+
|
|
724
|
+
class WorkerStartResponse(BaseModel):
    """Worker start configuration.

    Returned by POST /worker-queues/{queue_id}/start. Contains everything a
    worker process needs to connect to Temporal and the control plane.
    NOTE: includes live credentials (temporal_api_key, litellm_api_key) —
    treat responses as sensitive.
    """
    worker_id: str
    task_queue_name: str  # The queue UUID
    temporal_namespace: str
    temporal_host: str
    temporal_api_key: str
    organization_id: str
    control_plane_url: str
    heartbeat_interval: int  # Seconds between worker heartbeats
    # LiteLLM configuration for agno workflows/activities
    litellm_api_url: str
    litellm_api_key: str
    # Queue metadata
    queue_name: str
    environment_name: str
|
|
740
|
+
|
|
741
|
+
|
|
742
|
+
@router.post("/worker-queues/{queue_id}/start", response_model=WorkerStartResponse)
async def start_worker_for_queue(
    queue_id: str,
    request: Request,
    organization: dict = Depends(get_current_organization),
):
    """
    Start a worker for a specific queue.

    This endpoint is called by the CLI with: kubiya worker start --queue-id={queue_id}

    Returns all configuration needed for the worker to connect to Temporal.

    Side effects:
        Inserts a registration row into ``worker_heartbeats``.

    Raises:
        HTTPException: 403 if the queue exists but belongs to another
            organization, 404 if it does not exist at all, 400 if it is not
            active, 500 on configuration or database errors.
    """
    try:
        client = get_supabase()
        org_id = organization["id"]

        # Get worker queue - use maybe_single to avoid exception on missing rows
        try:
            result = (
                client.table("worker_queues")
                .select("*")
                .eq("id", queue_id)
                .eq("organization_id", org_id)
                .maybe_single()
                .execute()
            )
        except Exception as db_error:
            # Handle postgrest 204 No Content response (queue not found)
            error_str = str(db_error)
            if "'code': '204'" in error_str or "Missing response" in error_str:
                # Treat 204 as "no data found" rather than an error: build a
                # minimal stand-in object exposing only a .data attribute so
                # the checks below work uniformly.
                result = type('obj', (object,), {'data': None})()
            else:
                logger.error("database_query_failed", error=str(db_error), queue_id=queue_id)
                raise HTTPException(
                    status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                    detail=f"Database query failed. Please contact support."
                ) from db_error

        if not result or not result.data:
            # Check if queue exists at all (might be in different org)
            check_result = (
                client.table("worker_queues")
                .select("id, organization_id")
                .eq("id", queue_id)
                .maybe_single()
                .execute()
            )

            if check_result and check_result.data:
                # Queue exists but under a different organization -> 403
                raise HTTPException(
                    status_code=status.HTTP_403_FORBIDDEN,
                    detail=f"Worker queue '{queue_id}' not found in your organization"
                )
            else:
                raise HTTPException(
                    status_code=status.HTTP_404_NOT_FOUND,
                    detail=f"Worker queue '{queue_id}' does not exist. Please create a queue from the UI first."
                )

        queue = result.data

        # Get environment/task_queue separately
        if not queue.get("environment_id"):
            raise HTTPException(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail=f"Worker queue '{queue.get('name', queue_id)}' has no environment configured. Please contact support."
            )

        env_result = (
            client.table("environments")
            .select("name")
            .eq("id", queue["environment_id"])
            .eq("organization_id", org_id)
            .maybe_single()
            .execute()
        )

        if not env_result or not env_result.data:
            raise HTTPException(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail=f"Environment configuration error for queue '{queue.get('name', queue_id)}'. Please contact support."
            )

        environment_name = env_result.data["name"]

        # Check if queue is active
        if queue.get("status") != "active":
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail=f"Worker queue is not active (status: {queue.get('status')})"
            )

        # TEMPORARY: Use fixed namespace + admin API key
        # NOTE(review): the namespace is hard-coded and the admin token from
        # the environment is returned to the caller in the response below —
        # confirm this is intentional before shipping beyond the interim fix.
        import os
        namespace = {
            "namespace_name": "agent-control-plane.lpagu",
            "api_key_encrypted": os.getenv("TEMPORAL_CLOUD_ADMIN_TOKEN", ""),
        }

        # Generate worker ID
        worker_id = str(uuid.uuid4())

        # Create worker heartbeat record (registers the worker as active
        # immediately; heartbeats keep it alive afterwards)
        now = datetime.utcnow().isoformat()
        worker_record = {
            "id": worker_id,
            "worker_id": worker_id,
            "organization_id": org_id,
            "worker_queue_id": queue_id,
            "environment_name": environment_name,
            "status": "active",
            "tasks_processed": 0,
            "registered_at": now,
            "last_heartbeat": now,
            "updated_at": now,
            "worker_metadata": {},
        }

        client.table("worker_heartbeats").insert(worker_record).execute()

        # Get control plane URL
        control_plane_url = os.getenv("CONTROL_PLANE_URL", "https://agent-control-plane.vercel.app")
        temporal_host = os.getenv("TEMPORAL_HOST", "us-east-1.aws.api.temporal.io:7233")

        # Get LiteLLM configuration for agno workflows/activities
        litellm_api_url = os.getenv("LITELLM_API_URL", "https://llm-proxy.kubiya.ai")
        litellm_api_key = os.getenv("LITELLM_API_KEY", "")

        # Task queue name is just the queue UUID for security
        task_queue_name = queue_id

        logger.info(
            "worker_started_for_queue",
            worker_id=worker_id,
            queue_id=queue_id,
            task_queue_name=task_queue_name,
            org_id=org_id,
        )

        return WorkerStartResponse(
            worker_id=worker_id,
            task_queue_name=task_queue_name,
            temporal_namespace=namespace["namespace_name"],
            temporal_host=temporal_host,
            temporal_api_key=namespace["api_key_encrypted"],
            organization_id=org_id,
            control_plane_url=control_plane_url,
            heartbeat_interval=queue.get("heartbeat_interval", 30),
            litellm_api_url=litellm_api_url,
            litellm_api_key=litellm_api_key,
            queue_name=queue["name"],
            environment_name=environment_name,
        )

    except HTTPException:
        raise
    except Exception as e:
        logger.error(
            "worker_start_for_queue_failed",
            error=str(e),
            error_type=type(e).__name__,
            queue_id=queue_id,
            org_id=organization.get("id")
        )
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to start worker due to an internal error. Please try again or contact support. (Error ID: {queue_id[:8]})"
        )
|
|
912
|
+
|
|
913
|
+
|
|
914
|
+
def _generate_local_script(worker_id: str, control_plane_url: str) -> str:
    """Generate a bash script for local Python installation.

    The emitted script checks prerequisites (KUBIYA_API_KEY, python3),
    creates a per-worker virtualenv under ~/.kubiya/workers/<worker_id>,
    installs the worker dependencies, downloads worker.py, and writes both
    a run.sh launcher and an optional systemd unit file.
    """
    return f"""#!/bin/bash
# Kubiya Agent Worker - Local Installation Script
# Generated: {datetime.utcnow().isoformat()}

set -e

echo "🚀 Setting up Kubiya Agent Worker..."
echo ""

# Configuration
WORKER_ID="{worker_id}"
CONTROL_PLANE_URL="{control_plane_url}"

# Check if KUBIYA_API_KEY is set
if [ -z "$KUBIYA_API_KEY" ]; then
    echo "❌ Error: KUBIYA_API_KEY environment variable is not set"
    echo "Please set it with: export KUBIYA_API_KEY=your-api-key"
    exit 1
fi

# Check Python version
if ! command -v python3 &> /dev/null; then
    echo "❌ Error: Python 3 is not installed"
    exit 1
fi

PYTHON_VERSION=$(python3 --version | cut -d' ' -f2 | cut -d'.' -f1,2)
echo "✓ Found Python $PYTHON_VERSION"

# Create directory
WORKER_DIR="$HOME/.kubiya/workers/$WORKER_ID"
mkdir -p "$WORKER_DIR"
cd "$WORKER_DIR"

echo "✓ Created worker directory: $WORKER_DIR"

# Create virtual environment
echo "📦 Creating virtual environment..."
python3 -m venv venv
source venv/bin/activate

# Install dependencies
echo "📦 Installing dependencies..."
pip install --quiet --upgrade pip
pip install --quiet \\
    temporalio>=1.5.0 \\
    httpx>=0.27.0 \\
    structlog>=24.1.0 \\
    psutil>=5.9.0 \\
    agno-sdk>=0.1.0 \\
    litellm>=1.35.0

echo "✓ Dependencies installed"

# Download worker script
echo "📥 Downloading worker script..."
curl -s -o worker.py https://raw.githubusercontent.com/kubiya-sandbox/orchestrator/main/agent-worker/worker.py

echo "✓ Worker script downloaded"

# Create systemd service file (optional)
cat > kubiya-worker.service <<EOF
[Unit]
Description=Kubiya Agent Worker
After=network.target

[Service]
Type=simple
User=$USER
WorkingDirectory=$WORKER_DIR
Environment="WORKER_ID=$WORKER_ID"
Environment="KUBIYA_API_KEY=$KUBIYA_API_KEY"
Environment="CONTROL_PLANE_URL=$CONTROL_PLANE_URL"
ExecStart=$WORKER_DIR/venv/bin/python $WORKER_DIR/worker.py
Restart=always
RestartSec=10

[Install]
WantedBy=multi-user.target
EOF

echo "✓ Systemd service file created (optional)"

# Create run script
cat > run.sh <<EOF
#!/bin/bash
cd "$WORKER_DIR"
source venv/bin/activate
export WORKER_ID="$WORKER_ID"
export KUBIYA_API_KEY="$KUBIYA_API_KEY"
export CONTROL_PLANE_URL="$CONTROL_PLANE_URL"
python worker.py
EOF

chmod +x run.sh

echo ""
echo "✅ Installation complete!"
echo ""
echo "To start the worker:"
echo "  cd $WORKER_DIR && ./run.sh"
echo ""
echo "Or to install as a systemd service:"
echo "  sudo cp $WORKER_DIR/kubiya-worker.service /etc/systemd/system/"
echo "  sudo systemctl daemon-reload"
echo "  sudo systemctl enable kubiya-worker"
echo "  sudo systemctl start kubiya-worker"
echo ""
"""
|
|
1025
|
+
|
|
1026
|
+
|
|
1027
|
+
def _generate_docker_script(worker_id: str, control_plane_url: str, queue_name: str, environment_name: str) -> str:
    """Generate Docker commands for running the worker.

    Returns a commented shell snippet with a ``docker run`` invocation plus
    an equivalent docker-compose.yml written via a heredoc. The caller's
    KUBIYA_API_KEY is expected in the environment at run time.
    """
    return f"""# Kubiya Agent Worker - Docker Installation
# Generated: {datetime.utcnow().isoformat()}

# Configuration
WORKER_ID="{worker_id}"
CONTROL_PLANE_URL="{control_plane_url}"
QUEUE_NAME="{queue_name}"
ENVIRONMENT_NAME="{environment_name}"

# Make sure to set your API key
# export KUBIYA_API_KEY=your-api-key

# Run with Docker
docker run -d \\
    --name kubiya-worker-{queue_name}-{worker_id[:8]} \\
    --restart unless-stopped \\
    -e WORKER_ID="$WORKER_ID" \\
    -e KUBIYA_API_KEY="$KUBIYA_API_KEY" \\
    -e CONTROL_PLANE_URL="$CONTROL_PLANE_URL" \\
    -e LOG_LEVEL="INFO" \\
    kubiya/agent-worker:latest

# Check logs
# docker logs -f kubiya-worker-{queue_name}-{worker_id[:8]}

# Stop worker
# docker stop kubiya-worker-{queue_name}-{worker_id[:8]}

# Remove worker
# docker rm kubiya-worker-{queue_name}-{worker_id[:8]}

# Docker Compose (save as docker-compose.yml)
cat > docker-compose.yml <<EOF
version: '3.8'

services:
  worker:
    image: kubiya/agent-worker:latest
    container_name: kubiya-worker-{queue_name}
    restart: unless-stopped
    environment:
      - WORKER_ID={worker_id}
      - KUBIYA_API_KEY=${{KUBIYA_API_KEY}}
      - CONTROL_PLANE_URL={control_plane_url}
      - LOG_LEVEL=INFO
    healthcheck:
      test: ["CMD", "python", "-c", "import httpx; httpx.get('{control_plane_url}/health')"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 10s
EOF

# To use docker-compose:
# docker-compose up -d
"""
|
|
1085
|
+
|
|
1086
|
+
|
|
1087
|
+
def _generate_kubernetes_script(worker_id: str, control_plane_url: str, queue_name: str, environment_name: str) -> str:
    """Generate Kubernetes deployment YAML.

    Emits a ConfigMap, Deployment, and Service (plus a commented-out
    HorizontalPodAutoscaler example) for one worker queue. The API key is
    read from the 'kubiya-worker-secret' secret, key 'api-key'.
    """
    return f"""# Kubiya Agent Worker - Kubernetes Deployment
# Generated: {datetime.utcnow().isoformat()}
#
# To deploy:
# 1. Create secret: kubectl create secret generic kubiya-worker-secret --from-literal=api-key=YOUR_API_KEY
# 2. Apply this file: kubectl apply -f kubiya-worker.yaml
#
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: kubiya-worker-{queue_name}-config
  labels:
    app: kubiya-worker
    queue: {queue_name}
    environment: {environment_name}
data:
  WORKER_ID: "{worker_id}"
  CONTROL_PLANE_URL: "{control_plane_url}"
  LOG_LEVEL: "INFO"

---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: kubiya-worker-{queue_name}
  labels:
    app: kubiya-worker
    queue: {queue_name}
    environment: {environment_name}
spec:
  replicas: 1
  selector:
    matchLabels:
      app: kubiya-worker
      queue: {queue_name}
  template:
    metadata:
      labels:
        app: kubiya-worker
        queue: {queue_name}
        environment: {environment_name}
    spec:
      containers:
      - name: worker
        image: kubiya/agent-worker:latest
        imagePullPolicy: Always
        envFrom:
        - configMapRef:
            name: kubiya-worker-{queue_name}-config
        env:
        - name: KUBIYA_API_KEY
          valueFrom:
            secretKeyRef:
              name: kubiya-worker-secret
              key: api-key
        resources:
          requests:
            memory: "512Mi"
            cpu: "250m"
          limits:
            memory: "2Gi"
            cpu: "1000m"
        livenessProbe:
          httpGet:
            path: /health
            port: 8080
          initialDelaySeconds: 30
          periodSeconds: 30
          timeoutSeconds: 10
          failureThreshold: 3
        readinessProbe:
          httpGet:
            path: /health
            port: 8080
          initialDelaySeconds: 10
          periodSeconds: 10
          timeoutSeconds: 5
          failureThreshold: 3
      restartPolicy: Always

---
apiVersion: v1
kind: Service
metadata:
  name: kubiya-worker-{queue_name}
  labels:
    app: kubiya-worker
    queue: {queue_name}
spec:
  selector:
    app: kubiya-worker
    queue: {queue_name}
  ports:
  - protocol: TCP
    port: 8080
    targetPort: 8080
  type: ClusterIP

---
# Optional: HorizontalPodAutoscaler
# apiVersion: autoscaling/v2
# kind: HorizontalPodAutoscaler
# metadata:
#   name: kubiya-worker-{queue_name}
# spec:
#   scaleTargetRef:
#     apiVersion: apps/v1
#     kind: Deployment
#     name: kubiya-worker-{queue_name}
#   minReplicas: 1
#   maxReplicas: 10
#   metrics:
#   - type: Resource
#     resource:
#       name: cpu
#       target:
#         type: Utilization
#         averageUtilization: 70
"""
|
|
1209
|
+
|
|
1210
|
+
|
|
1211
|
+
class WorkerQueueCommandResponse(BaseModel):
    """Worker queue connection command.

    Returned by GET /worker-queues/{queue_id}/worker-command; describes the
    CLI command a user runs to attach a worker to this queue.
    """
    queue_id: str
    command: str  # Full CLI command string, e.g. "kubiya worker start --queue-id <id>"
    command_parts: dict  # Structured breakdown: binary / subcommand / flags
    can_register: bool  # True only when the queue status is "active"
    queue_status: str
    active_workers: int  # Count of live workers reported by Redis for this queue
    max_workers: Optional[int]
|
|
1220
|
+
|
|
1221
|
+
|
|
1222
|
+
class WorkerSystemInfo(BaseModel):
    """Worker system information.

    Every field is optional: workers report whatever telemetry they can
    gather, so any of these may be absent from a heartbeat payload.
    """
    # Host identity
    hostname: Optional[str] = None
    platform: Optional[str] = None
    os_name: Optional[str] = None
    os_version: Optional[str] = None
    # Runtime versions
    python_version: Optional[str] = None
    cli_version: Optional[str] = None
    # Docker availability on the worker host
    docker_available: Optional[bool] = None
    docker_version: Optional[str] = None
    # Resource usage snapshot (memory/disk totals presumably in bytes —
    # TODO confirm against the worker's reporting code)
    cpu_count: Optional[int] = None
    cpu_percent: Optional[float] = None
    memory_total: Optional[int] = None
    memory_used: Optional[int] = None
    memory_percent: Optional[float] = None
    disk_total: Optional[int] = None
    disk_used: Optional[int] = None
    disk_percent: Optional[float] = None
    uptime_seconds: Optional[float] = None
|
|
1241
|
+
|
|
1242
|
+
|
|
1243
|
+
class WorkerDetail(BaseModel):
    """Individual worker details.

    Combines live heartbeat data from Redis (status, task counters, logs,
    system info) with registration data from the worker_heartbeats table.
    """
    id: str
    worker_id: str
    status: str
    tasks_processed: int
    current_task_id: Optional[str]
    last_heartbeat: str  # Timestamp string taken from the Redis heartbeat payload
    registered_at: str  # Timestamp string from the worker_heartbeats DB row
    system_info: Optional[WorkerSystemInfo] = None
    logs: Optional[List[str]] = None  # Recent log lines reported in the heartbeat
    worker_metadata: dict
|
|
1255
|
+
|
|
1256
|
+
|
|
1257
|
+
@router.get("/worker-queues/{queue_id}/workers", response_model=List[WorkerDetail])
async def list_queue_workers(
    queue_id: str,
    request: Request,
    organization: dict = Depends(get_current_organization),
):
    """
    List all workers for a specific queue with detailed information.

    Live state (status, task counters, system info, logs) comes from Redis
    heartbeats; registration data (worker_id, registered_at) comes from the
    worker_heartbeats table. Results are ordered by most recent heartbeat.
    """
    try:
        supabase = get_supabase()
        org_id = organization["id"]

        # Live heartbeat payloads keyed by worker id
        live_workers = await get_active_workers_from_redis(org_id, queue_id)

        # Fetch registration rows only for the workers that are live
        registration_by_id = {}
        if live_workers:
            rows = (
                supabase.table("worker_heartbeats")
                .select("id, worker_id, registered_at")
                .eq("organization_id", org_id)
                .in_("id", list(live_workers.keys()))
                .execute()
            )
            registration_by_id = {row["id"]: row for row in (rows.data or [])}

        def build_detail(wid, beat):
            # Merge one Redis heartbeat payload with its DB registration row
            reg = registration_by_id.get(wid, {})
            raw_sysinfo = beat.get("system_info")
            return WorkerDetail(
                id=wid,
                worker_id=reg.get("worker_id", wid),
                status=beat.get("status", "unknown"),
                tasks_processed=beat.get("tasks_processed", 0),
                current_task_id=beat.get("current_task_id"),
                last_heartbeat=beat.get("last_heartbeat", ""),
                registered_at=reg.get("registered_at", ""),
                system_info=WorkerSystemInfo(**raw_sysinfo) if raw_sysinfo else None,
                logs=beat.get("logs", []),
                worker_metadata=beat.get("metadata", {}),
            )

        workers = [build_detail(wid, beat) for wid, beat in live_workers.items()]

        # Most recent heartbeat first
        workers.sort(key=lambda w: w.last_heartbeat, reverse=True)

        logger.info(
            "queue_workers_listed",
            queue_id=queue_id,
            worker_count=len(workers),
            org_id=org_id,
        )

        return workers

    except HTTPException:
        raise
    except Exception as e:
        logger.error("queue_workers_list_failed", error=str(e), queue_id=queue_id)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to list queue workers: {str(e)}"
        )
|
|
1333
|
+
|
|
1334
|
+
|
|
1335
|
+
@router.get("/worker-queues/{queue_id}/worker-command", response_model=WorkerQueueCommandResponse)
async def get_worker_queue_command(
    queue_id: str,
    request: Request,
    organization: dict = Depends(get_current_organization),
):
    """
    Get the worker registration command for a specific worker queue.

    Returns the kubiya worker start command with the queue ID that users
    should run to start a worker for this specific queue.

    Raises:
        HTTPException: 404 if the queue does not exist in this organization,
            500 on unexpected errors.
    """
    try:
        client = get_supabase()
        org_id = organization["id"]

        # Get worker queue. maybe_single() yields empty data for a missing
        # row instead of raising, so a nonexistent queue reaches the 404
        # below rather than the generic 500 handler.
        result = (
            client.table("worker_queues")
            .select("*")
            .eq("id", queue_id)
            .eq("organization_id", org_id)
            .maybe_single()
            .execute()
        )

        if not result or not result.data:
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail="Worker queue not found"
            )

        queue = result.data
        queue_status = queue.get("status", "unknown")

        # Workers may only register against an active queue
        can_register = queue_status == "active"

        # Get active workers from Redis for this specific queue
        active_workers_dict = await get_active_workers_from_redis(org_id, queue_id)
        active_workers = len(active_workers_dict)

        # Build command
        command = f"kubiya worker start --queue-id {queue_id}"

        command_parts = {
            "binary": "kubiya",
            "subcommand": "worker start",
            "flags": {
                "--queue-id": queue_id,
            },
        }

        logger.info(
            "worker_queue_command_retrieved",
            queue_id=queue_id,
            can_register=can_register,
            status=queue_status,
            active_workers=active_workers,
            org_id=org_id,
        )

        return WorkerQueueCommandResponse(
            queue_id=queue_id,
            command=command,
            command_parts=command_parts,
            can_register=can_register,
            queue_status=queue_status,
            active_workers=active_workers,
            max_workers=queue.get("max_workers"),
        )

    except HTTPException:
        raise
    except Exception as e:
        logger.error("worker_queue_command_failed", error=str(e), queue_id=queue_id)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to get worker queue command: {str(e)}"
        )
|
|
1412
|
+
|
|
1413
|
+
|
|
1414
|
+
def _generate_openshift_script(worker_id: str, control_plane_url: str, queue_name: str, environment_name: str) -> str:
|
|
1415
|
+
"""Generate OpenShift deployment YAML"""
|
|
1416
|
+
return f"""# Kubiya Agent Worker - OpenShift Deployment
|
|
1417
|
+
# Generated: {datetime.utcnow().isoformat()}
|
|
1418
|
+
#
|
|
1419
|
+
# To deploy:
|
|
1420
|
+
# 1. Create secret: oc create secret generic kubiya-worker-secret --from-literal=api-key=YOUR_API_KEY
|
|
1421
|
+
# 2. Apply this file: oc apply -f kubiya-worker.yaml
|
|
1422
|
+
#
|
|
1423
|
+
---
|
|
1424
|
+
apiVersion: v1
|
|
1425
|
+
kind: ConfigMap
|
|
1426
|
+
metadata:
|
|
1427
|
+
name: kubiya-worker-{queue_name}-config
|
|
1428
|
+
labels:
|
|
1429
|
+
app: kubiya-worker
|
|
1430
|
+
queue: {queue_name}
|
|
1431
|
+
environment: {environment_name}
|
|
1432
|
+
data:
|
|
1433
|
+
WORKER_ID: "{worker_id}"
|
|
1434
|
+
CONTROL_PLANE_URL: "{control_plane_url}"
|
|
1435
|
+
LOG_LEVEL: "INFO"
|
|
1436
|
+
|
|
1437
|
+
---
|
|
1438
|
+
apiVersion: apps.openshift.io/v1
|
|
1439
|
+
kind: DeploymentConfig
|
|
1440
|
+
metadata:
|
|
1441
|
+
name: kubiya-worker-{queue_name}
|
|
1442
|
+
labels:
|
|
1443
|
+
app: kubiya-worker
|
|
1444
|
+
queue: {queue_name}
|
|
1445
|
+
environment: {environment_name}
|
|
1446
|
+
spec:
|
|
1447
|
+
replicas: 1
|
|
1448
|
+
selector:
|
|
1449
|
+
app: kubiya-worker
|
|
1450
|
+
queue: {queue_name}
|
|
1451
|
+
template:
|
|
1452
|
+
metadata:
|
|
1453
|
+
labels:
|
|
1454
|
+
app: kubiya-worker
|
|
1455
|
+
queue: {queue_name}
|
|
1456
|
+
environment: {environment_name}
|
|
1457
|
+
spec:
|
|
1458
|
+
containers:
|
|
1459
|
+
- name: worker
|
|
1460
|
+
image: kubiya/agent-worker:latest
|
|
1461
|
+
imagePullPolicy: Always
|
|
1462
|
+
envFrom:
|
|
1463
|
+
- configMapRef:
|
|
1464
|
+
name: kubiya-worker-{queue_name}-config
|
|
1465
|
+
env:
|
|
1466
|
+
- name: KUBIYA_API_KEY
|
|
1467
|
+
valueFrom:
|
|
1468
|
+
secretKeyRef:
|
|
1469
|
+
name: kubiya-worker-secret
|
|
1470
|
+
key: api-key
|
|
1471
|
+
resources:
|
|
1472
|
+
requests:
|
|
1473
|
+
memory: "512Mi"
|
|
1474
|
+
cpu: "250m"
|
|
1475
|
+
limits:
|
|
1476
|
+
memory: "2Gi"
|
|
1477
|
+
cpu: "1000m"
|
|
1478
|
+
livenessProbe:
|
|
1479
|
+
httpGet:
|
|
1480
|
+
path: /health
|
|
1481
|
+
port: 8080
|
|
1482
|
+
initialDelaySeconds: 30
|
|
1483
|
+
periodSeconds: 30
|
|
1484
|
+
timeoutSeconds: 10
|
|
1485
|
+
failureThreshold: 3
|
|
1486
|
+
readinessProbe:
|
|
1487
|
+
httpGet:
|
|
1488
|
+
path: /health
|
|
1489
|
+
port: 8080
|
|
1490
|
+
initialDelaySeconds: 10
|
|
1491
|
+
periodSeconds: 10
|
|
1492
|
+
timeoutSeconds: 5
|
|
1493
|
+
failureThreshold: 3
|
|
1494
|
+
restartPolicy: Always
|
|
1495
|
+
securityContext:
|
|
1496
|
+
runAsNonRoot: true
|
|
1497
|
+
runAsUser: 1000
|
|
1498
|
+
triggers:
|
|
1499
|
+
- type: ConfigChange
|
|
1500
|
+
- type: ImageChange
|
|
1501
|
+
imageChangeParams:
|
|
1502
|
+
automatic: true
|
|
1503
|
+
containerNames:
|
|
1504
|
+
- worker
|
|
1505
|
+
from:
|
|
1506
|
+
kind: ImageStreamTag
|
|
1507
|
+
name: agent-worker:latest
|
|
1508
|
+
|
|
1509
|
+
---
|
|
1510
|
+
apiVersion: v1
|
|
1511
|
+
kind: Service
|
|
1512
|
+
metadata:
|
|
1513
|
+
name: kubiya-worker-{queue_name}
|
|
1514
|
+
labels:
|
|
1515
|
+
app: kubiya-worker
|
|
1516
|
+
queue: {queue_name}
|
|
1517
|
+
spec:
|
|
1518
|
+
selector:
|
|
1519
|
+
app: kubiya-worker
|
|
1520
|
+
queue: {queue_name}
|
|
1521
|
+
ports:
|
|
1522
|
+
- protocol: TCP
|
|
1523
|
+
port: 8080
|
|
1524
|
+
targetPort: 8080
|
|
1525
|
+
type: ClusterIP
|
|
1526
|
+
|
|
1527
|
+
---
|
|
1528
|
+
# Optional: Route to expose the service
|
|
1529
|
+
# apiVersion: route.openshift.io/v1
|
|
1530
|
+
# kind: Route
|
|
1531
|
+
# metadata:
|
|
1532
|
+
# name: kubiya-worker-{queue_name}
|
|
1533
|
+
# labels:
|
|
1534
|
+
# app: kubiya-worker
|
|
1535
|
+
# queue: {queue_name}
|
|
1536
|
+
# spec:
|
|
1537
|
+
# to:
|
|
1538
|
+
# kind: Service
|
|
1539
|
+
# name: kubiya-worker-{queue_name}
|
|
1540
|
+
# port:
|
|
1541
|
+
# targetPort: 8080
|
|
1542
|
+
# tls:
|
|
1543
|
+
# termination: edge
|
|
1544
|
+
# insecureEdgeTerminationPolicy: Redirect
|
|
1545
|
+
"""
|