kubiya-control-plane-api 0.1.0__py3-none-any.whl → 0.3.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of kubiya-control-plane-api might be problematic.
- control_plane_api/README.md +266 -0
- control_plane_api/__init__.py +0 -0
- control_plane_api/__version__.py +1 -0
- control_plane_api/alembic/README +1 -0
- control_plane_api/alembic/env.py +98 -0
- control_plane_api/alembic/script.py.mako +28 -0
- control_plane_api/alembic/versions/1382bec74309_initial_migration_with_all_models.py +251 -0
- control_plane_api/alembic/versions/1f54bc2a37e3_add_analytics_tables.py +162 -0
- control_plane_api/alembic/versions/2e4cb136dc10_rename_toolset_ids_to_skill_ids_in_teams.py +30 -0
- control_plane_api/alembic/versions/31cd69a644ce_add_skill_templates_table.py +28 -0
- control_plane_api/alembic/versions/89e127caa47d_add_jobs_and_job_executions_tables.py +161 -0
- control_plane_api/alembic/versions/add_llm_models_table.py +51 -0
- control_plane_api/alembic/versions/b0e10697f212_add_runtime_column_to_teams_simple.py +42 -0
- control_plane_api/alembic/versions/ce43b24b63bf_add_execution_trigger_source_and_fix_.py +155 -0
- control_plane_api/alembic/versions/d4eaf16e3f8d_rename_toolsets_to_skills.py +84 -0
- control_plane_api/alembic/versions/efa2dc427da1_rename_metadata_to_custom_metadata.py +32 -0
- control_plane_api/alembic/versions/f973b431d1ce_add_workflow_executor_to_skill_types.py +44 -0
- control_plane_api/alembic.ini +148 -0
- control_plane_api/api/index.py +12 -0
- control_plane_api/app/__init__.py +11 -0
- control_plane_api/app/activities/__init__.py +20 -0
- control_plane_api/app/activities/agent_activities.py +379 -0
- control_plane_api/app/activities/team_activities.py +410 -0
- control_plane_api/app/activities/temporal_cloud_activities.py +577 -0
- control_plane_api/app/config/__init__.py +35 -0
- control_plane_api/app/config/api_config.py +354 -0
- control_plane_api/app/config/model_pricing.py +318 -0
- control_plane_api/app/config.py +95 -0
- control_plane_api/app/database.py +135 -0
- control_plane_api/app/exceptions.py +408 -0
- control_plane_api/app/lib/__init__.py +11 -0
- control_plane_api/app/lib/job_executor.py +312 -0
- control_plane_api/app/lib/kubiya_client.py +235 -0
- control_plane_api/app/lib/litellm_pricing.py +166 -0
- control_plane_api/app/lib/planning_tools/__init__.py +22 -0
- control_plane_api/app/lib/planning_tools/agents.py +155 -0
- control_plane_api/app/lib/planning_tools/base.py +189 -0
- control_plane_api/app/lib/planning_tools/environments.py +214 -0
- control_plane_api/app/lib/planning_tools/resources.py +240 -0
- control_plane_api/app/lib/planning_tools/teams.py +198 -0
- control_plane_api/app/lib/policy_enforcer_client.py +939 -0
- control_plane_api/app/lib/redis_client.py +436 -0
- control_plane_api/app/lib/supabase.py +71 -0
- control_plane_api/app/lib/temporal_client.py +138 -0
- control_plane_api/app/lib/validation/__init__.py +20 -0
- control_plane_api/app/lib/validation/runtime_validation.py +287 -0
- control_plane_api/app/main.py +128 -0
- control_plane_api/app/middleware/__init__.py +8 -0
- control_plane_api/app/middleware/auth.py +513 -0
- control_plane_api/app/middleware/exception_handler.py +267 -0
- control_plane_api/app/middleware/rate_limiting.py +384 -0
- control_plane_api/app/middleware/request_id.py +202 -0
- control_plane_api/app/models/__init__.py +27 -0
- control_plane_api/app/models/agent.py +79 -0
- control_plane_api/app/models/analytics.py +206 -0
- control_plane_api/app/models/associations.py +81 -0
- control_plane_api/app/models/environment.py +63 -0
- control_plane_api/app/models/execution.py +93 -0
- control_plane_api/app/models/job.py +179 -0
- control_plane_api/app/models/llm_model.py +75 -0
- control_plane_api/app/models/presence.py +49 -0
- control_plane_api/app/models/project.py +47 -0
- control_plane_api/app/models/session.py +38 -0
- control_plane_api/app/models/team.py +66 -0
- control_plane_api/app/models/workflow.py +55 -0
- control_plane_api/app/policies/README.md +121 -0
- control_plane_api/app/policies/approved_users.rego +62 -0
- control_plane_api/app/policies/business_hours.rego +51 -0
- control_plane_api/app/policies/rate_limiting.rego +100 -0
- control_plane_api/app/policies/tool_restrictions.rego +86 -0
- control_plane_api/app/routers/__init__.py +4 -0
- control_plane_api/app/routers/agents.py +364 -0
- control_plane_api/app/routers/agents_v2.py +1260 -0
- control_plane_api/app/routers/analytics.py +1014 -0
- control_plane_api/app/routers/context_manager.py +562 -0
- control_plane_api/app/routers/environment_context.py +270 -0
- control_plane_api/app/routers/environments.py +715 -0
- control_plane_api/app/routers/execution_environment.py +517 -0
- control_plane_api/app/routers/executions.py +1911 -0
- control_plane_api/app/routers/health.py +92 -0
- control_plane_api/app/routers/health_v2.py +326 -0
- control_plane_api/app/routers/integrations.py +274 -0
- control_plane_api/app/routers/jobs.py +1344 -0
- control_plane_api/app/routers/models.py +82 -0
- control_plane_api/app/routers/models_v2.py +361 -0
- control_plane_api/app/routers/policies.py +639 -0
- control_plane_api/app/routers/presence.py +234 -0
- control_plane_api/app/routers/projects.py +902 -0
- control_plane_api/app/routers/runners.py +379 -0
- control_plane_api/app/routers/runtimes.py +172 -0
- control_plane_api/app/routers/secrets.py +155 -0
- control_plane_api/app/routers/skills.py +1001 -0
- control_plane_api/app/routers/skills_definitions.py +140 -0
- control_plane_api/app/routers/task_planning.py +1256 -0
- control_plane_api/app/routers/task_queues.py +654 -0
- control_plane_api/app/routers/team_context.py +270 -0
- control_plane_api/app/routers/teams.py +1400 -0
- control_plane_api/app/routers/worker_queues.py +1545 -0
- control_plane_api/app/routers/workers.py +935 -0
- control_plane_api/app/routers/workflows.py +204 -0
- control_plane_api/app/runtimes/__init__.py +6 -0
- control_plane_api/app/runtimes/validation.py +344 -0
- control_plane_api/app/schemas/job_schemas.py +295 -0
- control_plane_api/app/services/__init__.py +1 -0
- control_plane_api/app/services/agno_service.py +619 -0
- control_plane_api/app/services/litellm_service.py +190 -0
- control_plane_api/app/services/policy_service.py +525 -0
- control_plane_api/app/services/temporal_cloud_provisioning.py +150 -0
- control_plane_api/app/skills/__init__.py +44 -0
- control_plane_api/app/skills/base.py +229 -0
- control_plane_api/app/skills/business_intelligence.py +189 -0
- control_plane_api/app/skills/data_visualization.py +154 -0
- control_plane_api/app/skills/docker.py +104 -0
- control_plane_api/app/skills/file_generation.py +94 -0
- control_plane_api/app/skills/file_system.py +110 -0
- control_plane_api/app/skills/python.py +92 -0
- control_plane_api/app/skills/registry.py +65 -0
- control_plane_api/app/skills/shell.py +102 -0
- control_plane_api/app/skills/workflow_executor.py +469 -0
- control_plane_api/app/utils/workflow_executor.py +354 -0
- control_plane_api/app/workflows/__init__.py +11 -0
- control_plane_api/app/workflows/agent_execution.py +507 -0
- control_plane_api/app/workflows/agent_execution_with_skills.py +222 -0
- control_plane_api/app/workflows/namespace_provisioning.py +326 -0
- control_plane_api/app/workflows/team_execution.py +399 -0
- control_plane_api/scripts/seed_models.py +239 -0
- control_plane_api/worker/__init__.py +0 -0
- control_plane_api/worker/activities/__init__.py +0 -0
- control_plane_api/worker/activities/agent_activities.py +1241 -0
- control_plane_api/worker/activities/approval_activities.py +234 -0
- control_plane_api/worker/activities/runtime_activities.py +388 -0
- control_plane_api/worker/activities/skill_activities.py +267 -0
- control_plane_api/worker/activities/team_activities.py +1217 -0
- control_plane_api/worker/config/__init__.py +31 -0
- control_plane_api/worker/config/worker_config.py +275 -0
- control_plane_api/worker/control_plane_client.py +529 -0
- control_plane_api/worker/examples/analytics_integration_example.py +362 -0
- control_plane_api/worker/models/__init__.py +1 -0
- control_plane_api/worker/models/inputs.py +89 -0
- control_plane_api/worker/runtimes/__init__.py +31 -0
- control_plane_api/worker/runtimes/base.py +789 -0
- control_plane_api/worker/runtimes/claude_code_runtime.py +1443 -0
- control_plane_api/worker/runtimes/default_runtime.py +617 -0
- control_plane_api/worker/runtimes/factory.py +173 -0
- control_plane_api/worker/runtimes/validation.py +93 -0
- control_plane_api/worker/services/__init__.py +1 -0
- control_plane_api/worker/services/agent_executor.py +422 -0
- control_plane_api/worker/services/agent_executor_v2.py +383 -0
- control_plane_api/worker/services/analytics_collector.py +457 -0
- control_plane_api/worker/services/analytics_service.py +464 -0
- control_plane_api/worker/services/approval_tools.py +310 -0
- control_plane_api/worker/services/approval_tools_agno.py +207 -0
- control_plane_api/worker/services/cancellation_manager.py +177 -0
- control_plane_api/worker/services/data_visualization.py +827 -0
- control_plane_api/worker/services/jira_tools.py +257 -0
- control_plane_api/worker/services/runtime_analytics.py +328 -0
- control_plane_api/worker/services/session_service.py +194 -0
- control_plane_api/worker/services/skill_factory.py +175 -0
- control_plane_api/worker/services/team_executor.py +574 -0
- control_plane_api/worker/services/team_executor_v2.py +465 -0
- control_plane_api/worker/services/workflow_executor_tools.py +1418 -0
- control_plane_api/worker/tests/__init__.py +1 -0
- control_plane_api/worker/tests/e2e/__init__.py +0 -0
- control_plane_api/worker/tests/e2e/test_execution_flow.py +571 -0
- control_plane_api/worker/tests/integration/__init__.py +0 -0
- control_plane_api/worker/tests/integration/test_control_plane_integration.py +308 -0
- control_plane_api/worker/tests/unit/__init__.py +0 -0
- control_plane_api/worker/tests/unit/test_control_plane_client.py +401 -0
- control_plane_api/worker/utils/__init__.py +1 -0
- control_plane_api/worker/utils/chunk_batcher.py +305 -0
- control_plane_api/worker/utils/retry_utils.py +60 -0
- control_plane_api/worker/utils/streaming_utils.py +373 -0
- control_plane_api/worker/worker.py +753 -0
- control_plane_api/worker/workflows/__init__.py +0 -0
- control_plane_api/worker/workflows/agent_execution.py +589 -0
- control_plane_api/worker/workflows/team_execution.py +429 -0
- kubiya_control_plane_api-0.3.4.dist-info/METADATA +229 -0
- kubiya_control_plane_api-0.3.4.dist-info/RECORD +182 -0
- kubiya_control_plane_api-0.3.4.dist-info/entry_points.txt +2 -0
- kubiya_control_plane_api-0.3.4.dist-info/top_level.txt +1 -0
- kubiya_control_plane_api-0.1.0.dist-info/METADATA +0 -66
- kubiya_control_plane_api-0.1.0.dist-info/RECORD +0 -5
- kubiya_control_plane_api-0.1.0.dist-info/top_level.txt +0 -1
- {kubiya_control_plane_api-0.1.0.dist-info/licenses → control_plane_api}/LICENSE +0 -0
- {kubiya_control_plane_api-0.1.0.dist-info → kubiya_control_plane_api-0.3.4.dist-info}/WHEEL +0 -0
control_plane_api/app/routers/workers.py (new file)

@@ -0,0 +1,935 @@

"""Workers endpoint - shows registered Temporal workers and handles worker registration"""

from fastapi import APIRouter, Depends, HTTPException, status, Request
from typing import List, Dict, Any, Optional
from pydantic import BaseModel
from datetime import datetime
import structlog
import uuid
import json

from control_plane_api.app.middleware.auth import get_current_organization
from control_plane_api.app.lib.temporal_client import get_temporal_client
from control_plane_api.app.lib.supabase import get_supabase
from control_plane_api.app.lib.redis_client import get_redis_client

logger = structlog.get_logger()

router = APIRouter()


class WorkerInfo(BaseModel):
    """Worker information"""
    identity: str
    last_access_time: str | None
    rate_per_second: float | None


class TaskQueueInfo(BaseModel):
    """Task queue with worker information"""
    task_queue: str
    organization_id: str
    runner_name: str
    workers: List[WorkerInfo]
    worker_count: int
    approximate_backlog_count: int | None


@router.get("", response_model=List[TaskQueueInfo])
async def list_workers(
    request: Request,
    organization: dict = Depends(get_current_organization),
):
    """
    List registered Temporal workers for the organization.

    This queries Temporal to get all task queues for the organization
    and returns information about registered workers on each queue.

    Task queue naming convention: {organization_id}.{runner_name}
    """
    try:
        temporal_client = await get_temporal_client()
        org_id = organization["id"]

        # Get runners from Kubiya API to know which task queues to check
        from control_plane_api.app.lib.kubiya_client import get_kubiya_client
        kubiya_client = get_kubiya_client()
        token = request.state.kubiya_token

        try:
            runners = await kubiya_client.get_runners(token, org_id)
        except Exception as e:
            logger.warning(
                "failed_to_fetch_kubiya_runners",
                error=str(e),
                org_id=org_id
            )
            # If we can't get runners from Kubiya, fall back to checking common ones
            runners = [{"name": "default"}]

        environments_info = []

        for runner in runners:
            # Runner might be a dict or a string
            if isinstance(runner, dict):
                runner_name = runner.get("name", "default")
            else:
                runner_name = str(runner) if runner else "default"

            task_queue = f"{org_id}.{runner_name}"

            try:
                # Describe the task queue to get worker information
                desc = await temporal_client.describe_task_queue(
                    task_queue=task_queue,
                    task_queue_type=1,  # TaskQueueType.WORKFLOW
                )

                workers = []
                approximate_backlog = None

                # Extract worker information from pollers
                if desc.pollers:
                    for poller in desc.pollers:
                        worker_info = WorkerInfo(
                            identity=poller.identity,
                            last_access_time=poller.last_access_time.isoformat() if poller.last_access_time else None,
                            rate_per_second=poller.rate_per_second if hasattr(poller, 'rate_per_second') else None,
                        )
                        workers.append(worker_info)

                # Get approximate backlog count if available
                if hasattr(desc, 'approximate_backlog_count'):
                    approximate_backlog = desc.approximate_backlog_count

                task_queue_info = TaskQueueInfo(
                    task_queue=task_queue,
                    organization_id=org_id,
                    runner_name=runner_name,
                    workers=workers,
                    worker_count=len(workers),
                    approximate_backlog_count=approximate_backlog,
                )

                environments_info.append(task_queue_info)

                logger.info(
                    "task_queue_described",
                    task_queue=task_queue,
                    worker_count=len(workers),
                    org_id=org_id,
                )

            except Exception as e:
                # Task queue might not exist yet if no worker has registered
                logger.debug(
                    "task_queue_not_found",
                    task_queue=task_queue,
                    error=str(e),
                    org_id=org_id,
                )
                # Add empty task queue info
                task_queue_info = TaskQueueInfo(
                    task_queue=task_queue,
                    organization_id=org_id,
                    runner_name=runner_name,
                    workers=[],
                    worker_count=0,
                    approximate_backlog_count=None,
                )
                environments_info.append(task_queue_info)

        logger.info(
            "workers_listed",
            org_id=org_id,
            task_queue_count=len(environments_info),
            total_workers=sum(tq.worker_count for tq in environments_info),
        )

        return environments_info

    except Exception as e:
        logger.error(
            "workers_list_failed",
            error=str(e),
            org_id=organization["id"]
        )
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to list workers: {str(e)}"
        )
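Reviewer note: a minimal client-side sketch of calling the endpoint above. The mount prefix (/api/v1/workers below) and the UserKey Authorization scheme are illustrative assumptions; neither is confirmed by this diff.

import httpx

def list_workers(base_url: str, api_key: str) -> list[dict]:
    # GET on the router root returns one TaskQueueInfo per runner task queue
    resp = httpx.get(
        f"{base_url}/api/v1/workers",  # assumed mount path
        headers={"Authorization": f"UserKey {api_key}"},  # assumed auth scheme
        timeout=30.0,
    )
    resp.raise_for_status()
    return resp.json()

if __name__ == "__main__":
    for tq in list_workers("https://control-plane.example.com", "YOUR_API_KEY"):
        print(f"{tq['task_queue']}: {tq['worker_count']} worker(s)")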
@router.get("/{runner_name}", response_model=TaskQueueInfo)
async def get_workers_for_runner(
    runner_name: str,
    request: Request,
    organization: dict = Depends(get_current_organization),
):
    """
    Get worker information for a specific runner.

    Args:
        runner_name: The runner name (e.g., "default", "production-runner")
    """
    try:
        temporal_client = await get_temporal_client()
        org_id = organization["id"]
        task_queue = f"{org_id}.{runner_name}"

        try:
            # Describe the task queue
            desc = await temporal_client.describe_task_queue(
                task_queue=task_queue,
                task_queue_type=1,  # TaskQueueType.WORKFLOW
            )

            workers = []
            approximate_backlog = None

            # Extract worker information
            if desc.pollers:
                for poller in desc.pollers:
                    worker_info = WorkerInfo(
                        identity=poller.identity,
                        last_access_time=poller.last_access_time.isoformat() if poller.last_access_time else None,
                        rate_per_second=poller.rate_per_second if hasattr(poller, 'rate_per_second') else None,
                    )
                    workers.append(worker_info)

            if hasattr(desc, 'approximate_backlog_count'):
                approximate_backlog = desc.approximate_backlog_count

            task_queue_info = TaskQueueInfo(
                task_queue=task_queue,
                organization_id=org_id,
                runner_name=runner_name,
                workers=workers,
                worker_count=len(workers),
                approximate_backlog_count=approximate_backlog,
            )

            logger.info(
                "workers_fetched_for_runner",
                runner_name=runner_name,
                worker_count=len(workers),
                org_id=org_id,
            )

            return task_queue_info

        except Exception as e:
            logger.warning(
                "task_queue_not_found",
                task_queue=task_queue,
                error=str(e),
                org_id=org_id,
            )
            # Return empty worker info if task queue doesn't exist
            return TaskQueueInfo(
                task_queue=task_queue,
                organization_id=org_id,
                runner_name=runner_name,
                workers=[],
                worker_count=0,
                approximate_backlog_count=None,
            )

    except Exception as e:
        logger.error(
            "workers_fetch_failed",
            error=str(e),
            runner_name=runner_name,
            org_id=organization["id"]
        )
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to fetch workers: {str(e)}"
        )
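For reference, an illustrative TaskQueueInfo payload as returned by the two endpoints above; every value below is invented.

# Hypothetical response shape for GET /{runner_name}
example_task_queue_info = {
    "task_queue": "org_123.default",  # {organization_id}.{runner_name}
    "organization_id": "org_123",
    "runner_name": "default",
    "workers": [
        {
            "identity": "12345@worker-host-1",  # Temporal poller identity
            "last_access_time": "2025-01-01T12:00:00+00:00",
            "rate_per_second": 100000.0,
        }
    ],
    "worker_count": 1,
    "approximate_backlog_count": None,  # only set when Temporal reports it
}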
# Worker Registration for Decoupled Architecture


class WorkerRegistrationRequest(BaseModel):
    """Worker registration request"""
    environment_name: str  # Task queue / environment name worker wants to join
    hostname: Optional[str] = None
    worker_metadata: Dict[str, Any] = {}


class WorkerRegistrationResponse(BaseModel):
    """Worker registration response with all config needed"""
    worker_id: str  # Unique worker ID
    worker_token: str  # Token for this worker (from environment)
    environment_name: str  # Task queue name (format: org_id.environment)
    temporal_namespace: str
    temporal_host: str
    temporal_api_key: str
    organization_id: str
    control_plane_url: str


class WorkerHeartbeatRequest(BaseModel):
    """Worker heartbeat request"""
    worker_id: str
    environment_name: str
    status: str = "active"  # active, idle, busy
    tasks_processed: int = 0
    current_task_id: Optional[str] = None
    worker_metadata: Dict[str, Any] = {}
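A request body matching WorkerRegistrationRequest might look like this (all values hypothetical):

registration_payload = {
    "environment_name": "default",  # task queue / environment to join
    "hostname": "worker-host-1",  # optional
    "worker_metadata": {"version": "0.3.4", "pid": 4242},  # free-form
}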
@router.post("/register", response_model=WorkerRegistrationResponse)
async def register_worker(
    registration: WorkerRegistrationRequest,
    request: Request,
    organization: dict = Depends(get_current_organization),
):
    """
    Register a new worker with the control plane.

    This endpoint is called by workers on startup to get their configuration.
    The worker authenticates using KUBIYA_API_KEY (same auth as other API calls).

    Returns:
        All configuration needed for worker to connect to Temporal and operate:
        - worker_id: Unique ID for this worker instance
        - worker_token: Environment's worker token
        - environment_name: Formatted task queue name (org_id.environment)
        - temporal_namespace, temporal_host, temporal_api_key: Temporal Cloud config
        - organization_id: Organization ID
        - control_plane_url: URL to send heartbeats
    """
    try:
        client = get_supabase()
        org_id = organization["id"]

        # Look up the environment by name
        env_result = (
            client.table("environments")
            .select("*")
            .eq("organization_id", org_id)
            .eq("name", registration.environment_name)
            .execute()
        )

        # If environment doesn't exist, create it
        if not env_result.data or len(env_result.data) == 0:
            logger.info(
                "creating_environment_for_worker",
                environment_name=registration.environment_name,
                org_id=org_id,
            )

            # Generate worker token for this environment (UUID format)
            worker_token = str(uuid.uuid4())

            # Create the environment
            new_env = {
                "id": str(uuid.uuid4()),
                "organization_id": org_id,
                "name": registration.environment_name,
                "worker_token": worker_token,
                "status": "active",  # Mark as active immediately
                "created_at": datetime.utcnow().isoformat(),
                "updated_at": datetime.utcnow().isoformat(),
            }

            env_create_result = (
                client.table("environments")
                .insert(new_env)
                .execute()
            )

            environment = env_create_result.data[0]

            logger.info(
                "environment_created_for_worker",
                environment_name=registration.environment_name,
                environment_id=environment["id"],
                org_id=org_id,
            )
        else:
            environment = env_result.data[0]

            # Check if environment is ready
            if environment.get("status") not in ["ready", "active"]:
                raise HTTPException(
                    status_code=status.HTTP_400_BAD_REQUEST,
                    detail=f"Environment is not ready (status: {environment.get('status')}). "
                           f"Please wait for provisioning to complete."
                )

        # TEMPORARY: Skip provisioning and use fixed namespace + admin API key
        # Get temporal namespace for this organization
        import os

        # Use fixed namespace for testing
        namespace = {
            "namespace_name": "agent-control-plane.lpagu",
            "api_key_encrypted": os.getenv("TEMPORAL_CLOUD_ADMIN_TOKEN", ""),
            "status": "ready"
        }

        logger.info(
            "using_fixed_namespace_for_testing",
            namespace_name=namespace["namespace_name"],
            org_id=org_id,
        )

        # Generate worker ID
        worker_id = str(uuid.uuid4())

        # Create worker record in database
        worker_record = {
            "id": worker_id,  # Use id as primary key
            "worker_id": worker_id,  # Also set worker_id (has NOT NULL constraint)
            "organization_id": org_id,
            "environment_name": registration.environment_name,
            "worker_token": environment.get("worker_token"),
            "hostname": registration.hostname,
            "worker_metadata": registration.worker_metadata,
            "status": "active",
            "tasks_processed": 0,
            "registered_at": datetime.utcnow().isoformat(),
            "last_heartbeat": datetime.utcnow().isoformat(),
            "updated_at": datetime.utcnow().isoformat(),
        }

        client.table("worker_heartbeats").insert(worker_record).execute()

        # Format task queue name: org_id.environment_name
        task_queue_name = f"{org_id}.{registration.environment_name}"

        # Get Temporal Cloud configuration
        temporal_host = os.getenv("TEMPORAL_HOST", "us-east-1.aws.api.temporal.io:7233")

        # Decrypt API key from namespace (TODO: implement proper decryption)
        temporal_api_key = namespace.get("api_key_encrypted", "")

        # Get control plane URL from environment or construct from request
        control_plane_url = os.getenv("CONTROL_PLANE_URL")
        if not control_plane_url:
            # Construct from request if not set
            control_plane_url = f"{request.url.scheme}://{request.url.netloc}"

        logger.info(
            "worker_registered",
            worker_id=worker_id,
            environment_name=registration.environment_name,
            task_queue=task_queue_name,
            org_id=org_id,
        )

        return WorkerRegistrationResponse(
            worker_id=worker_id,
            worker_token=environment.get("worker_token"),
            environment_name=task_queue_name,  # Return formatted name
            temporal_namespace=namespace.get("namespace_name"),
            temporal_host=temporal_host,
            temporal_api_key=temporal_api_key,
            organization_id=org_id,
            control_plane_url=control_plane_url,
        )

    except HTTPException:
        raise
    except Exception as e:
        logger.error(
            "worker_registration_failed",
            error=str(e),
            environment_name=registration.environment_name,
            org_id=organization["id"]
        )
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to register worker: {str(e)}"
        )
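Reviewer note: a sketch of the worker-side counterpart, registering and then connecting to Temporal Cloud with the returned credentials. The api_key/tls arguments are the temporalio SDK's API-key auth style; whether this package's worker connects exactly this way is not shown in this hunk, and the URL prefix and auth header remain assumptions.

import httpx
from temporalio.client import Client

async def register_and_connect(base_url: str, api_key: str) -> Client:
    # 1. Register with the control plane to obtain Temporal credentials
    async with httpx.AsyncClient(timeout=30.0) as http:
        resp = await http.post(
            f"{base_url}/api/v1/workers/register",  # assumed mount path
            headers={"Authorization": f"UserKey {api_key}"},  # assumed auth scheme
            json={"environment_name": "default", "hostname": "worker-host-1"},
        )
        resp.raise_for_status()
        config = resp.json()  # WorkerRegistrationResponse fields

    # 2. Connect to Temporal Cloud using the returned namespace and API key
    return await Client.connect(
        config["temporal_host"],
        namespace=config["temporal_namespace"],
        api_key=config["temporal_api_key"],
        tls=True,
    )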
@router.post("/heartbeat", status_code=status.HTTP_204_NO_CONTENT)
async def worker_heartbeat(
    heartbeat: WorkerHeartbeatRequest,
    request: Request,
    organization: dict = Depends(get_current_organization),
):
    """
    Receive heartbeat from a worker.

    OPTIMIZATION: Uses Redis for scalable heartbeat storage instead of the database.
    Database writes are expensive and heartbeats happen every 30s per worker.

    Workers should call this endpoint periodically (e.g., every 30 seconds) to:
    - Confirm they're still alive
    - Update their status (active, idle, busy)
    - Report tasks processed
    - Update metadata
    """
    try:
        org_id = organization["id"]
        redis_client = get_redis_client()

        if not redis_client:
            # Redis not available - log warning but don't fail (graceful degradation)
            logger.warning(
                "worker_heartbeat_redis_unavailable",
                worker_id=heartbeat.worker_id,
                org_id=org_id,
            )
            return None

        # Build heartbeat data for Redis
        heartbeat_data = {
            "worker_id": heartbeat.worker_id,
            "organization_id": org_id,
            "environment_name": heartbeat.environment_name,
            "status": heartbeat.status,
            "tasks_processed": heartbeat.tasks_processed,
            "current_task_id": heartbeat.current_task_id,
            "last_heartbeat": datetime.utcnow().isoformat(),
            "metadata": heartbeat.worker_metadata,
        }

        # Store in Redis with 5-minute TTL (if worker crashes, heartbeat expires)
        redis_key = f"worker:{heartbeat.worker_id}:heartbeat"
        await redis_client.set(redis_key, json.dumps(heartbeat_data), ex=300)

        logger.debug(
            "worker_heartbeat_received",
            worker_id=heartbeat.worker_id,
            status=heartbeat.status,
            environment_name=heartbeat.environment_name,
            org_id=org_id,
        )

        return None

    except Exception as e:
        logger.error(
            "worker_heartbeat_failed",
            error=str(e),
            worker_id=heartbeat.worker_id,
            org_id=organization["id"]
        )
        # Don't fail the worker if heartbeat fails - graceful degradation
        return None
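A worker-side loop consistent with the 30-second cadence and 300-second Redis TTL described above could look like the following sketch (same assumed prefix and auth scheme as in the earlier sketches):

import asyncio
import httpx

async def heartbeat_loop(base_url: str, api_key: str, worker_id: str, environment_name: str):
    # Post every 30s; the server-side TTL of 300s tolerates a few missed beats
    async with httpx.AsyncClient(timeout=10.0) as http:
        while True:
            try:
                await http.post(
                    f"{base_url}/api/v1/workers/heartbeat",
                    headers={"Authorization": f"UserKey {api_key}"},
                    json={
                        "worker_id": worker_id,
                        "environment_name": environment_name,
                        "status": "active",
                    },
                )
            except httpx.HTTPError:
                pass  # mirror the server's graceful-degradation stance
            await asyncio.sleep(30)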
# Worker ID-based endpoints (new architecture)


class WorkerStartRequest(BaseModel):
    """Request to start a worker and fetch its config"""
    system_info: Dict[str, Any] = {}


class WorkerConfigResponse(BaseModel):
    """Worker configuration response"""
    worker_id: str
    worker_queue_name: str
    environment_name: str
    task_queue_name: str  # Full: org.env.worker_queue
    temporal_namespace: str
    temporal_host: str
    temporal_api_key: str
    organization_id: str
    control_plane_url: str
    heartbeat_interval: int = 30
    # LiteLLM configuration
    litellm_api_url: str
    litellm_api_key: str


class WorkerSystemInfo(BaseModel):
    """Worker system information"""
    hostname: Optional[str] = None
    platform: Optional[str] = None
    os_name: Optional[str] = None
    os_version: Optional[str] = None
    python_version: Optional[str] = None
    cli_version: Optional[str] = None
    docker_available: Optional[bool] = None
    docker_version: Optional[str] = None
    cpu_count: Optional[int] = None
    cpu_percent: Optional[float] = None
    memory_total: Optional[int] = None  # bytes
    memory_used: Optional[int] = None  # bytes
    memory_percent: Optional[float] = None
    disk_total: Optional[int] = None  # bytes
    disk_used: Optional[int] = None  # bytes
    disk_percent: Optional[float] = None
    uptime_seconds: Optional[float] = None


class WorkerHeartbeatSimple(BaseModel):
    """Simplified heartbeat request (worker_id in URL)"""
    status: str = "active"
    tasks_processed: int = 0
    current_task_id: Optional[str] = None
    worker_metadata: Dict[str, Any] = {}
    system_info: Optional[WorkerSystemInfo] = None
    logs: Optional[List[str]] = None  # Recent log lines since last heartbeat
@router.post("/{worker_id}/start", response_model=WorkerConfigResponse)
async def start_worker(
    worker_id: str,
    start_request: WorkerStartRequest,
    request: Request,
    organization: dict = Depends(get_current_organization),
):
    """
    Start a worker and fetch its configuration.

    This endpoint is called by workers on startup with just worker_id and API key.
    It returns all necessary configuration for the worker to connect to Temporal.

    Args:
        worker_id: Worker ID (UUID created in UI)
        start_request: System information from worker

    Returns:
        Complete worker configuration including Temporal credentials
    """
    try:
        client = get_supabase()
        org_id = organization["id"]

        # Look up worker in database
        worker_result = (
            client.table("worker_heartbeats")
            .select("*")
            .eq("id", worker_id)
            .eq("organization_id", org_id)
            .single()
            .execute()
        )

        if not worker_result.data:
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail=f"Worker '{worker_id}' not found"
            )

        worker = worker_result.data

        # Get worker queue separately
        if not worker.get("worker_queue_id"):
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail="Worker has no queue assigned"
            )

        queue_result = (
            client.table("worker_queues")
            .select("*")
            .eq("id", worker["worker_queue_id"])
            .eq("organization_id", org_id)
            .single()
            .execute()
        )

        if not queue_result.data:
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail="Worker queue not found"
            )

        worker_queue = queue_result.data
        worker_queue_name = worker_queue["name"]

        # Get environment separately
        environment_name = "default"
        if worker_queue.get("environment_id"):
            env_result = (
                client.table("environments")
                .select("name")
                .eq("id", worker_queue["environment_id"])
                .eq("organization_id", org_id)
                .maybe_single()
                .execute()
            )
            if env_result.data:
                environment_name = env_result.data["name"]

        # TEMPORARY: Skip database lookup and use fixed namespace + admin API key
        import os

        # Use fixed namespace for testing
        namespace = {
            "namespace_name": "agent-control-plane.lpagu",
            "api_key_encrypted": os.getenv("TEMPORAL_CLOUD_ADMIN_TOKEN", ""),
            "status": "ready"
        }

        logger.info(
            "using_fixed_namespace_for_testing",
            namespace_name=namespace["namespace_name"],
            worker_id=worker_id,
            org_id=org_id,
        )

        # Update worker with system info and mark as starting
        update_data = {
            "worker_metadata": {
                **worker.get("worker_metadata", {}),
                **start_request.system_info,
                "last_start": datetime.utcnow().isoformat(),
            },
            "status": "active",
            "last_heartbeat": datetime.utcnow().isoformat(),
            "updated_at": datetime.utcnow().isoformat(),
        }

        client.table("worker_heartbeats").update(update_data).eq("id", worker_id).execute()

        # Build full task queue name
        task_queue_name = f"{org_id}.{environment_name}.{worker_queue_name}"

        # Get Temporal Cloud configuration
        temporal_host = os.getenv("TEMPORAL_HOST", "us-east-1.aws.api.temporal.io:7233")
        temporal_api_key = namespace.get("api_key_encrypted", "")

        # Get control plane URL
        control_plane_url = os.getenv("CONTROL_PLANE_URL")
        if not control_plane_url:
            control_plane_url = f"{request.url.scheme}://{request.url.netloc}"

        # Get LiteLLM configuration from environment
        litellm_api_url = os.getenv("LITELLM_API_URL", "https://api.openai.com/v1")
        litellm_api_key = os.getenv("LITELLM_API_KEY", "")

        logger.info(
            "worker_config_fetched",
            worker_id=worker_id,
            task_queue=task_queue_name,
            environment=environment_name,
            worker_queue=worker_queue_name,
            org_id=org_id,
        )

        return WorkerConfigResponse(
            worker_id=worker_id,
            worker_queue_name=worker_queue_name,
            environment_name=environment_name,
            task_queue_name=task_queue_name,
            temporal_namespace=namespace.get("namespace_name"),
            temporal_host=temporal_host,
            temporal_api_key=temporal_api_key,
            organization_id=org_id,
            control_plane_url=control_plane_url,
            heartbeat_interval=worker_queue.get("heartbeat_interval", 30),
            litellm_api_url=litellm_api_url,
            litellm_api_key=litellm_api_key,
        )

    except HTTPException:
        raise
    except Exception as e:
        logger.error(
            "worker_start_failed",
            error=str(e),
            worker_id=worker_id,
            org_id=organization.get("id")
        )
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to start worker: {str(e)}"
        )
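A hypothetical startup call against this endpoint, sending a subset of the WorkerSystemInfo fields gathered with the standard library (same assumed prefix and auth as above):

import platform
import httpx

def fetch_worker_config(base_url: str, api_key: str, worker_id: str) -> dict:
    # System info is merged into worker_metadata server-side
    system_info = {
        "hostname": platform.node(),
        "platform": platform.system(),
        "python_version": platform.python_version(),
    }
    resp = httpx.post(
        f"{base_url}/api/v1/workers/{worker_id}/start",
        headers={"Authorization": f"UserKey {api_key}"},
        json={"system_info": system_info},
        timeout=30.0,
    )
    resp.raise_for_status()
    return resp.json()  # WorkerConfigResponse, incl. task_queue_name "org.env.queue"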
@router.post("/{worker_id}/heartbeat", status_code=status.HTTP_204_NO_CONTENT)
async def worker_heartbeat_simple(
    worker_id: str,
    heartbeat: WorkerHeartbeatSimple,
    request: Request,
    organization: dict = Depends(get_current_organization),
):
    """
    Receive heartbeat from a worker (simplified version with worker_id in URL).

    OPTIMIZATION: Uses Redis for scalable heartbeat storage instead of the database.
    Database writes are expensive and heartbeats happen every 30s per worker.
    Redis provides sub-millisecond writes and automatic TTL expiration.

    Args:
        worker_id: Worker ID (UUID)
        heartbeat: Heartbeat data
    """
    try:
        org_id = organization["id"]
        redis_client = get_redis_client()

        if not redis_client:
            # Redis not available - log warning but don't fail (graceful degradation)
            logger.warning(
                "worker_heartbeat_redis_unavailable",
                worker_id=worker_id,
                org_id=org_id,
            )
            return None

        # Build heartbeat data for Redis
        heartbeat_data = {
            "worker_id": worker_id,
            "organization_id": org_id,
            "status": heartbeat.status,
            "tasks_processed": heartbeat.tasks_processed,
            "current_task_id": heartbeat.current_task_id,
            "last_heartbeat": datetime.utcnow().isoformat(),
            "metadata": heartbeat.worker_metadata,
        }

        if heartbeat.system_info:
            heartbeat_data["system_info"] = heartbeat.system_info.dict(exclude_none=True)

        # Handle logs - fetch from Redis and append new logs
        redis_key = f"worker:{worker_id}:heartbeat"
        if heartbeat.logs:
            try:
                # Get existing heartbeat data to retrieve logs
                existing_data = await redis_client.get(redis_key)
                if existing_data:
                    existing_heartbeat = json.loads(existing_data)
                    existing_logs = existing_heartbeat.get("logs", [])
                    all_logs = existing_logs + heartbeat.logs
                    heartbeat_data["logs"] = all_logs[-100:]  # Keep last 100 lines
                else:
                    heartbeat_data["logs"] = heartbeat.logs[-100:]
            except Exception as log_error:
                logger.warning("heartbeat_log_merge_failed", error=str(log_error))
                heartbeat_data["logs"] = heartbeat.logs[-100:]

        # Store in Redis with 5-minute TTL (if worker crashes, heartbeat expires)
        # TTL is 10x the heartbeat interval (30s * 10 = 300s) for safety
        await redis_client.set(redis_key, json.dumps(heartbeat_data), ex=300)

        logger.debug(
            "worker_heartbeat_received",
            worker_id=worker_id,
            status=heartbeat.status,
            org_id=org_id,
        )

        return None

    except Exception as e:
        logger.error(
            "worker_heartbeat_failed",
            error=str(e),
            worker_id=worker_id,
            org_id=organization.get("id")
        )
        # Don't fail the worker if heartbeat fails - graceful degradation
        return None
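Because heartbeats live only in Redis under worker:{worker_id}:heartbeat with a 300-second TTL, a monitoring script can read them back directly. A sketch using redis-py's asyncio client (connection URL invented):

import asyncio
import json
import redis.asyncio as redis

async def read_worker_heartbeat(redis_url: str, worker_id: str) -> dict | None:
    # None means the key expired, i.e. no heartbeat in the last 300 seconds
    client = redis.from_url(redis_url)
    try:
        raw = await client.get(f"worker:{worker_id}:heartbeat")
        return json.loads(raw) if raw else None
    finally:
        await client.aclose()

# asyncio.run(read_worker_heartbeat("redis://localhost:6379/0", "<worker-uuid>"))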
class WorkerDisconnectRequest(BaseModel):
    """Worker disconnect request"""
    reason: str = "shutdown"  # shutdown, error, crash, etc.
    exit_code: Optional[int] = None
    error_message: Optional[str] = None


@router.post("/{worker_id}/disconnect", status_code=status.HTTP_204_NO_CONTENT)
async def worker_disconnect(
    worker_id: str,
    disconnect: WorkerDisconnectRequest,
    request: Request,
    organization: dict = Depends(get_current_organization),
):
    """
    Mark a worker as disconnected/offline.

    This endpoint is called by workers when they:
    - Shut down gracefully (Ctrl+C)
    - Exit due to an error
    - Crash unexpectedly (via atexit handler)

    Args:
        worker_id: Worker ID (UUID)
        disconnect: Disconnect details (reason, exit code, error)
    """
    try:
        client = get_supabase()
        org_id = organization["id"]

        # Update worker status to disconnected in database
        update_data = {
            "status": "disconnected",
            "last_heartbeat": datetime.utcnow().isoformat(),
            "worker_metadata": {
                "disconnect_reason": disconnect.reason,
                "disconnect_time": datetime.utcnow().isoformat(),
                "exit_code": disconnect.exit_code,
                "error_message": disconnect.error_message,
            },
            "updated_at": datetime.utcnow().isoformat(),
        }

        result = (
            client.table("worker_heartbeats")
            .update(update_data)
            .eq("id", worker_id)
            .eq("organization_id", org_id)
            .execute()
        )

        if not result.data:
            logger.warning(
                "worker_disconnect_not_found",
                worker_id=worker_id,
                org_id=org_id,
            )
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail="Worker not found"
            )

        # IMPORTANT: Also remove from Redis immediately so UI updates instantly
        redis_client = get_redis_client()
        if redis_client:
            redis_key = f"worker:{worker_id}:heartbeat"
            try:
                # Delete the heartbeat key from Redis
                await redis_client.delete(redis_key)
                logger.info(
                    "worker_removed_from_redis",
                    worker_id=worker_id,
                    redis_key=redis_key
                )
            except Exception as redis_error:
                # Log but don't fail the disconnect
                logger.warning(
                    "redis_delete_failed",
                    error=str(redis_error),
                    worker_id=worker_id
                )

        logger.info(
            "worker_disconnected",
            worker_id=worker_id,
            reason=disconnect.reason,
            exit_code=disconnect.exit_code,
            org_id=org_id,
        )

        return None

    except HTTPException:
        raise
    except Exception as e:
        logger.error(
            "worker_disconnect_failed",
            error=str(e),
            worker_id=worker_id,
            org_id=organization.get("id")
        )
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to process disconnect: {str(e)}"
        )
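Finally, the disconnect endpoint is aimed at atexit-style hooks, as its docstring notes. A best-effort worker-side sketch (same assumed prefix and auth as in the earlier sketches):

import atexit
import httpx

def install_disconnect_hook(base_url: str, api_key: str, worker_id: str) -> None:
    def _disconnect():
        try:
            httpx.post(
                f"{base_url}/api/v1/workers/{worker_id}/disconnect",
                headers={"Authorization": f"UserKey {api_key}"},
                json={"reason": "shutdown", "exit_code": 0},
                timeout=5.0,
            )
        except httpx.HTTPError:
            pass  # never block process exit on a failed disconnect
    atexit.register(_disconnect)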