kubiya-control-plane-api 0.1.0__py3-none-any.whl → 0.3.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of kubiya-control-plane-api might be problematic. Click here for more details.
- control_plane_api/README.md +266 -0
- control_plane_api/__init__.py +0 -0
- control_plane_api/__version__.py +1 -0
- control_plane_api/alembic/README +1 -0
- control_plane_api/alembic/env.py +98 -0
- control_plane_api/alembic/script.py.mako +28 -0
- control_plane_api/alembic/versions/1382bec74309_initial_migration_with_all_models.py +251 -0
- control_plane_api/alembic/versions/1f54bc2a37e3_add_analytics_tables.py +162 -0
- control_plane_api/alembic/versions/2e4cb136dc10_rename_toolset_ids_to_skill_ids_in_teams.py +30 -0
- control_plane_api/alembic/versions/31cd69a644ce_add_skill_templates_table.py +28 -0
- control_plane_api/alembic/versions/89e127caa47d_add_jobs_and_job_executions_tables.py +161 -0
- control_plane_api/alembic/versions/add_llm_models_table.py +51 -0
- control_plane_api/alembic/versions/b0e10697f212_add_runtime_column_to_teams_simple.py +42 -0
- control_plane_api/alembic/versions/ce43b24b63bf_add_execution_trigger_source_and_fix_.py +155 -0
- control_plane_api/alembic/versions/d4eaf16e3f8d_rename_toolsets_to_skills.py +84 -0
- control_plane_api/alembic/versions/efa2dc427da1_rename_metadata_to_custom_metadata.py +32 -0
- control_plane_api/alembic/versions/f973b431d1ce_add_workflow_executor_to_skill_types.py +44 -0
- control_plane_api/alembic.ini +148 -0
- control_plane_api/api/index.py +12 -0
- control_plane_api/app/__init__.py +11 -0
- control_plane_api/app/activities/__init__.py +20 -0
- control_plane_api/app/activities/agent_activities.py +379 -0
- control_plane_api/app/activities/team_activities.py +410 -0
- control_plane_api/app/activities/temporal_cloud_activities.py +577 -0
- control_plane_api/app/config/__init__.py +35 -0
- control_plane_api/app/config/api_config.py +354 -0
- control_plane_api/app/config/model_pricing.py +318 -0
- control_plane_api/app/config.py +95 -0
- control_plane_api/app/database.py +135 -0
- control_plane_api/app/exceptions.py +408 -0
- control_plane_api/app/lib/__init__.py +11 -0
- control_plane_api/app/lib/job_executor.py +312 -0
- control_plane_api/app/lib/kubiya_client.py +235 -0
- control_plane_api/app/lib/litellm_pricing.py +166 -0
- control_plane_api/app/lib/planning_tools/__init__.py +22 -0
- control_plane_api/app/lib/planning_tools/agents.py +155 -0
- control_plane_api/app/lib/planning_tools/base.py +189 -0
- control_plane_api/app/lib/planning_tools/environments.py +214 -0
- control_plane_api/app/lib/planning_tools/resources.py +240 -0
- control_plane_api/app/lib/planning_tools/teams.py +198 -0
- control_plane_api/app/lib/policy_enforcer_client.py +939 -0
- control_plane_api/app/lib/redis_client.py +436 -0
- control_plane_api/app/lib/supabase.py +71 -0
- control_plane_api/app/lib/temporal_client.py +138 -0
- control_plane_api/app/lib/validation/__init__.py +20 -0
- control_plane_api/app/lib/validation/runtime_validation.py +287 -0
- control_plane_api/app/main.py +128 -0
- control_plane_api/app/middleware/__init__.py +8 -0
- control_plane_api/app/middleware/auth.py +513 -0
- control_plane_api/app/middleware/exception_handler.py +267 -0
- control_plane_api/app/middleware/rate_limiting.py +384 -0
- control_plane_api/app/middleware/request_id.py +202 -0
- control_plane_api/app/models/__init__.py +27 -0
- control_plane_api/app/models/agent.py +79 -0
- control_plane_api/app/models/analytics.py +206 -0
- control_plane_api/app/models/associations.py +81 -0
- control_plane_api/app/models/environment.py +63 -0
- control_plane_api/app/models/execution.py +93 -0
- control_plane_api/app/models/job.py +179 -0
- control_plane_api/app/models/llm_model.py +75 -0
- control_plane_api/app/models/presence.py +49 -0
- control_plane_api/app/models/project.py +47 -0
- control_plane_api/app/models/session.py +38 -0
- control_plane_api/app/models/team.py +66 -0
- control_plane_api/app/models/workflow.py +55 -0
- control_plane_api/app/policies/README.md +121 -0
- control_plane_api/app/policies/approved_users.rego +62 -0
- control_plane_api/app/policies/business_hours.rego +51 -0
- control_plane_api/app/policies/rate_limiting.rego +100 -0
- control_plane_api/app/policies/tool_restrictions.rego +86 -0
- control_plane_api/app/routers/__init__.py +4 -0
- control_plane_api/app/routers/agents.py +364 -0
- control_plane_api/app/routers/agents_v2.py +1260 -0
- control_plane_api/app/routers/analytics.py +1014 -0
- control_plane_api/app/routers/context_manager.py +562 -0
- control_plane_api/app/routers/environment_context.py +270 -0
- control_plane_api/app/routers/environments.py +715 -0
- control_plane_api/app/routers/execution_environment.py +517 -0
- control_plane_api/app/routers/executions.py +1911 -0
- control_plane_api/app/routers/health.py +92 -0
- control_plane_api/app/routers/health_v2.py +326 -0
- control_plane_api/app/routers/integrations.py +274 -0
- control_plane_api/app/routers/jobs.py +1344 -0
- control_plane_api/app/routers/models.py +82 -0
- control_plane_api/app/routers/models_v2.py +361 -0
- control_plane_api/app/routers/policies.py +639 -0
- control_plane_api/app/routers/presence.py +234 -0
- control_plane_api/app/routers/projects.py +902 -0
- control_plane_api/app/routers/runners.py +379 -0
- control_plane_api/app/routers/runtimes.py +172 -0
- control_plane_api/app/routers/secrets.py +155 -0
- control_plane_api/app/routers/skills.py +1001 -0
- control_plane_api/app/routers/skills_definitions.py +140 -0
- control_plane_api/app/routers/task_planning.py +1256 -0
- control_plane_api/app/routers/task_queues.py +654 -0
- control_plane_api/app/routers/team_context.py +270 -0
- control_plane_api/app/routers/teams.py +1400 -0
- control_plane_api/app/routers/worker_queues.py +1545 -0
- control_plane_api/app/routers/workers.py +935 -0
- control_plane_api/app/routers/workflows.py +204 -0
- control_plane_api/app/runtimes/__init__.py +6 -0
- control_plane_api/app/runtimes/validation.py +344 -0
- control_plane_api/app/schemas/job_schemas.py +295 -0
- control_plane_api/app/services/__init__.py +1 -0
- control_plane_api/app/services/agno_service.py +619 -0
- control_plane_api/app/services/litellm_service.py +190 -0
- control_plane_api/app/services/policy_service.py +525 -0
- control_plane_api/app/services/temporal_cloud_provisioning.py +150 -0
- control_plane_api/app/skills/__init__.py +44 -0
- control_plane_api/app/skills/base.py +229 -0
- control_plane_api/app/skills/business_intelligence.py +189 -0
- control_plane_api/app/skills/data_visualization.py +154 -0
- control_plane_api/app/skills/docker.py +104 -0
- control_plane_api/app/skills/file_generation.py +94 -0
- control_plane_api/app/skills/file_system.py +110 -0
- control_plane_api/app/skills/python.py +92 -0
- control_plane_api/app/skills/registry.py +65 -0
- control_plane_api/app/skills/shell.py +102 -0
- control_plane_api/app/skills/workflow_executor.py +469 -0
- control_plane_api/app/utils/workflow_executor.py +354 -0
- control_plane_api/app/workflows/__init__.py +11 -0
- control_plane_api/app/workflows/agent_execution.py +507 -0
- control_plane_api/app/workflows/agent_execution_with_skills.py +222 -0
- control_plane_api/app/workflows/namespace_provisioning.py +326 -0
- control_plane_api/app/workflows/team_execution.py +399 -0
- control_plane_api/scripts/seed_models.py +239 -0
- control_plane_api/worker/__init__.py +0 -0
- control_plane_api/worker/activities/__init__.py +0 -0
- control_plane_api/worker/activities/agent_activities.py +1241 -0
- control_plane_api/worker/activities/approval_activities.py +234 -0
- control_plane_api/worker/activities/runtime_activities.py +388 -0
- control_plane_api/worker/activities/skill_activities.py +267 -0
- control_plane_api/worker/activities/team_activities.py +1217 -0
- control_plane_api/worker/config/__init__.py +31 -0
- control_plane_api/worker/config/worker_config.py +275 -0
- control_plane_api/worker/control_plane_client.py +529 -0
- control_plane_api/worker/examples/analytics_integration_example.py +362 -0
- control_plane_api/worker/models/__init__.py +1 -0
- control_plane_api/worker/models/inputs.py +89 -0
- control_plane_api/worker/runtimes/__init__.py +31 -0
- control_plane_api/worker/runtimes/base.py +789 -0
- control_plane_api/worker/runtimes/claude_code_runtime.py +1443 -0
- control_plane_api/worker/runtimes/default_runtime.py +617 -0
- control_plane_api/worker/runtimes/factory.py +173 -0
- control_plane_api/worker/runtimes/validation.py +93 -0
- control_plane_api/worker/services/__init__.py +1 -0
- control_plane_api/worker/services/agent_executor.py +422 -0
- control_plane_api/worker/services/agent_executor_v2.py +383 -0
- control_plane_api/worker/services/analytics_collector.py +457 -0
- control_plane_api/worker/services/analytics_service.py +464 -0
- control_plane_api/worker/services/approval_tools.py +310 -0
- control_plane_api/worker/services/approval_tools_agno.py +207 -0
- control_plane_api/worker/services/cancellation_manager.py +177 -0
- control_plane_api/worker/services/data_visualization.py +827 -0
- control_plane_api/worker/services/jira_tools.py +257 -0
- control_plane_api/worker/services/runtime_analytics.py +328 -0
- control_plane_api/worker/services/session_service.py +194 -0
- control_plane_api/worker/services/skill_factory.py +175 -0
- control_plane_api/worker/services/team_executor.py +574 -0
- control_plane_api/worker/services/team_executor_v2.py +465 -0
- control_plane_api/worker/services/workflow_executor_tools.py +1418 -0
- control_plane_api/worker/tests/__init__.py +1 -0
- control_plane_api/worker/tests/e2e/__init__.py +0 -0
- control_plane_api/worker/tests/e2e/test_execution_flow.py +571 -0
- control_plane_api/worker/tests/integration/__init__.py +0 -0
- control_plane_api/worker/tests/integration/test_control_plane_integration.py +308 -0
- control_plane_api/worker/tests/unit/__init__.py +0 -0
- control_plane_api/worker/tests/unit/test_control_plane_client.py +401 -0
- control_plane_api/worker/utils/__init__.py +1 -0
- control_plane_api/worker/utils/chunk_batcher.py +305 -0
- control_plane_api/worker/utils/retry_utils.py +60 -0
- control_plane_api/worker/utils/streaming_utils.py +373 -0
- control_plane_api/worker/worker.py +753 -0
- control_plane_api/worker/workflows/__init__.py +0 -0
- control_plane_api/worker/workflows/agent_execution.py +589 -0
- control_plane_api/worker/workflows/team_execution.py +429 -0
- kubiya_control_plane_api-0.3.4.dist-info/METADATA +229 -0
- kubiya_control_plane_api-0.3.4.dist-info/RECORD +182 -0
- kubiya_control_plane_api-0.3.4.dist-info/entry_points.txt +2 -0
- kubiya_control_plane_api-0.3.4.dist-info/top_level.txt +1 -0
- kubiya_control_plane_api-0.1.0.dist-info/METADATA +0 -66
- kubiya_control_plane_api-0.1.0.dist-info/RECORD +0 -5
- kubiya_control_plane_api-0.1.0.dist-info/top_level.txt +0 -1
- {kubiya_control_plane_api-0.1.0.dist-info/licenses → control_plane_api}/LICENSE +0 -0
- {kubiya_control_plane_api-0.1.0.dist-info → kubiya_control_plane_api-0.3.4.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,1911 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Multi-tenant executions router with Supabase.
|
|
3
|
+
|
|
4
|
+
This router handles execution queries for the authenticated organization.
|
|
5
|
+
Uses Supabase directly to avoid SQLAlchemy enum validation issues.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from fastapi import APIRouter, Depends, HTTPException, status, Request
|
|
9
|
+
from fastapi.responses import StreamingResponse
|
|
10
|
+
from typing import List, Optional
|
|
11
|
+
from datetime import datetime
|
|
12
|
+
from pydantic import BaseModel, Field
|
|
13
|
+
import structlog
|
|
14
|
+
import asyncio
|
|
15
|
+
import json
|
|
16
|
+
|
|
17
|
+
from control_plane_api.app.middleware.auth import get_current_organization
|
|
18
|
+
from control_plane_api.app.lib.supabase import get_supabase
|
|
19
|
+
from control_plane_api.app.lib.temporal_client import get_temporal_client
|
|
20
|
+
from control_plane_api.app.lib.redis_client import get_redis_client
|
|
21
|
+
from control_plane_api.app.workflows.agent_execution import AgentExecutionWorkflow
|
|
22
|
+
from control_plane_api.app.services.agno_service import agno_service
|
|
23
|
+
|
|
24
|
+
logger = structlog.get_logger()
|
|
25
|
+
|
|
26
|
+
router = APIRouter()
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
# Pydantic schemas
class ParticipantResponse(BaseModel):
    """A user participating in an execution (multiplayer support).

    All timestamp fields are ISO-8601 strings as returned by the database.
    """
    id: str  # participant row id
    user_id: str  # id of the participating user
    user_name: str | None  # display name, if known
    user_email: str | None  # email, if known
    user_avatar: str | None  # avatar URL, if available
    role: str  # participant role, e.g. "collaborator"
    joined_at: str  # ISO timestamp when the user joined the execution
    last_active_at: str  # ISO timestamp of the user's last activity
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class ExecutionResponse(BaseModel):
    """API representation of a single execution row.

    Timestamp fields are ISO-8601 strings. `usage` and `execution_metadata`
    are free-form dicts written by workers via the PATCH endpoint.
    """
    id: str
    organization_id: str  # owning organization (multi-tenant scoping)
    execution_type: str  # stored uppercase, e.g. "AGENT" or "TEAM"
    entity_id: str  # id of the agent/team being executed
    entity_name: str | None
    prompt: str
    system_prompt: str | None
    status: str  # stored lowercase, e.g. "running", "waiting_for_input"
    response: str | None  # final response text, once available
    error_message: str | None
    usage: dict  # usage metrics (updated by workers)
    execution_metadata: dict  # free-form metadata (updated by workers)
    runner_name: str | None
    user_id: str | None  # attribution of the initiating user
    user_name: str | None
    user_email: str | None
    user_avatar: str | None
    created_at: str
    started_at: str | None
    completed_at: str | None
    updated_at: str
    # Multiplayer participants; populated by the list endpoint only.
    participants: List[ParticipantResponse] = Field(default_factory=list)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
@router.get("", response_model=List[ExecutionResponse])
async def list_executions(
    request: Request,
    skip: int = 0,
    limit: int = 100,
    status_filter: str | None = None,
    execution_type: str | None = None,
    organization: dict = Depends(get_current_organization),
):
    """List all executions for the organization with optional filtering.

    Args:
        request: Incoming HTTP request (injected by FastAPI).
        skip: Pagination offset (number of rows to skip).
        limit: Maximum number of rows to return.
        status_filter: Optional status filter; normalized to lowercase to
            match how statuses are stored.
        execution_type: Optional type filter; normalized to uppercase to
            match how types are stored.
        organization: Authenticated organization context from auth middleware.

    Returns:
        List of ExecutionResponse, newest first. Rows or participants that
        fail to parse are logged and skipped rather than failing the request.

    Raises:
        HTTPException: 500 on any unexpected query/parse failure.
    """
    try:
        client = get_supabase()

        # Query executions for this organization with participants joined in
        # one round trip (nested select on execution_participants).
        query = client.table("executions").select("*, execution_participants(*)").eq("organization_id", organization["id"])

        if status_filter:
            query = query.eq("status", status_filter.lower())  # Normalize to lowercase
        if execution_type:
            query = query.eq("execution_type", execution_type.upper())

        # Supabase range() is inclusive on both ends, hence the -1.
        query = query.order("created_at", desc=True).range(skip, skip + limit - 1)

        result = query.execute()

        if not result or not result.data:
            logger.info("no_executions_found", org_id=organization["id"])
            return []

        executions = []
        for execution in result.data:
            try:
                # Parse participants; an invalid participant row should not
                # prevent the execution itself from being returned.
                # "or []" guards against the joined column being SQL NULL.
                participants = []
                for p in execution.get("execution_participants") or []:
                    try:
                        participants.append(ParticipantResponse(
                            id=p["id"],
                            user_id=p["user_id"],
                            user_name=p.get("user_name"),
                            user_email=p.get("user_email"),
                            user_avatar=p.get("user_avatar"),
                            role=p["role"],
                            joined_at=p["joined_at"],
                            last_active_at=p["last_active_at"],
                        ))
                    except Exception as participant_error:
                        logger.warning("failed_to_parse_participant", error=str(participant_error), execution_id=execution.get("id"))
                        # Skip invalid participant, continue with others

                executions.append(
                    ExecutionResponse(
                        id=execution["id"],
                        organization_id=execution["organization_id"],
                        execution_type=execution["execution_type"],
                        entity_id=execution["entity_id"],
                        entity_name=execution.get("entity_name"),
                        # "or" fallbacks (not .get defaults): .get("x", {}) still
                        # returns None when the column is present but SQL NULL,
                        # which would fail validation of the non-optional fields.
                        prompt=execution.get("prompt") or "",
                        system_prompt=execution.get("system_prompt"),
                        status=execution["status"],
                        response=execution.get("response"),
                        error_message=execution.get("error_message"),
                        usage=execution.get("usage") or {},
                        execution_metadata=execution.get("execution_metadata") or {},
                        runner_name=execution.get("runner_name"),
                        user_id=execution.get("user_id"),
                        user_name=execution.get("user_name"),
                        user_email=execution.get("user_email"),
                        user_avatar=execution.get("user_avatar"),
                        created_at=execution["created_at"],
                        started_at=execution.get("started_at"),
                        completed_at=execution.get("completed_at"),
                        updated_at=execution["updated_at"],
                        participants=participants,
                    )
                )
            except Exception as execution_error:
                logger.error("failed_to_parse_execution", error=str(execution_error), execution_id=execution.get("id"))
                # Skip invalid execution, continue with others

        logger.info(
            "executions_listed_successfully",
            count=len(executions),
            org_id=organization["id"],
        )

        return executions

    except HTTPException:
        raise
    except Exception as e:
        logger.error(
            "executions_list_failed",
            error=str(e),
            error_type=type(e).__name__,
            org_id=organization["id"]
        )
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to list executions: {str(e)}"
        )
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
@router.get("/{execution_id}", response_model=ExecutionResponse)
async def get_execution(
    execution_id: str,
    request: Request,
    organization: dict = Depends(get_current_organization),
):
    """Get a specific execution by ID.

    The lookup is scoped to the caller's organization, so an execution id
    belonging to another tenant yields a 404. Participants are not included
    by this endpoint (the list endpoint joins them).

    Raises:
        HTTPException: 404 when not found, 500 on unexpected failures.
    """
    try:
        client = get_supabase()

        result = (
            client.table("executions")
            .select("*")
            .eq("id", execution_id)
            .eq("organization_id", organization["id"])
            .single()
            .execute()
        )

        if not result.data:
            raise HTTPException(status_code=404, detail="Execution not found")

        execution_data = result.data

        return ExecutionResponse(
            id=execution_data["id"],
            organization_id=execution_data["organization_id"],
            execution_type=execution_data["execution_type"],
            entity_id=execution_data["entity_id"],
            entity_name=execution_data.get("entity_name"),
            # "or" fallbacks (not .get defaults): .get("x", {}) still returns
            # None when the column is present but SQL NULL, which would fail
            # validation of the non-optional str/dict fields on the model.
            prompt=execution_data.get("prompt") or "",
            system_prompt=execution_data.get("system_prompt"),
            status=execution_data["status"],
            response=execution_data.get("response"),
            error_message=execution_data.get("error_message"),
            usage=execution_data.get("usage") or {},
            execution_metadata=execution_data.get("execution_metadata") or {},
            runner_name=execution_data.get("runner_name"),
            user_id=execution_data.get("user_id"),
            user_name=execution_data.get("user_name"),
            user_email=execution_data.get("user_email"),
            user_avatar=execution_data.get("user_avatar"),
            created_at=execution_data["created_at"],
            started_at=execution_data.get("started_at"),
            completed_at=execution_data.get("completed_at"),
            updated_at=execution_data["updated_at"],
        )

    except HTTPException:
        raise
    except Exception as e:
        logger.error("execution_get_failed", error=str(e), execution_id=execution_id)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to get execution: {str(e)}"
        )
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
@router.delete("/{execution_id}", status_code=status.HTTP_204_NO_CONTENT)
async def delete_execution(
    execution_id: str,
    request: Request,
    organization: dict = Depends(get_current_organization),
):
    """Delete an execution, scoped to the caller's organization.

    Returns 204 on success; raises 404 when no matching row exists for
    this organization, and 500 on unexpected failures.
    """
    try:
        supabase = get_supabase()

        # Both filters together ensure a tenant can only delete its own rows.
        deleted = (
            supabase.table("executions")
            .delete()
            .eq("id", execution_id)
            .eq("organization_id", organization["id"])
            .execute()
        )

        # An empty result means nothing matched: report not-found.
        if not deleted.data:
            raise HTTPException(status_code=404, detail="Execution not found")

        logger.info("execution_deleted", execution_id=execution_id, org_id=organization["id"])
        return None

    except HTTPException:
        raise
    except Exception as e:
        logger.error("execution_delete_failed", error=str(e), execution_id=execution_id)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to delete execution: {str(e)}"
        )
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
class ExecutionUpdate(BaseModel):
    """Update execution fields - used by workers to update execution status.

    All fields are optional; only fields that are explicitly provided
    (non-None) are written. Timestamps are ISO-8601 strings.
    """
    status: str | None = None  # normalized to lowercase before storage
    started_at: str | None = None
    completed_at: str | None = None
    response: str | None = None  # final response text
    error_message: str | None = None
    usage: dict | None = None  # usage metrics reported by the worker
    execution_metadata: dict | None = None  # free-form metadata
|
|
273
|
+
|
|
274
|
+
|
|
275
|
+
@router.patch("/{execution_id}", response_model=ExecutionResponse)
async def update_execution(
    execution_id: str,
    execution_update: ExecutionUpdate,
    request: Request,
    organization: dict = Depends(get_current_organization),
):
    """
    Update execution status and results.

    This endpoint is primarily used by workers to update execution status,
    results, usage metrics, and metadata during execution. Only fields that
    are explicitly provided are written; `updated_at` is always refreshed.

    Raises:
        HTTPException: 404 when the execution does not exist in this
            organization, 500 on unexpected failures.
    """
    try:
        from datetime import timezone  # module top only imports datetime

        client = get_supabase()

        # Build update dict - only include provided fields
        update_data = {}

        if execution_update.status is not None:
            update_data["status"] = execution_update.status.lower()  # Normalize to lowercase

        if execution_update.started_at is not None:
            update_data["started_at"] = execution_update.started_at

        if execution_update.completed_at is not None:
            update_data["completed_at"] = execution_update.completed_at

        if execution_update.response is not None:
            update_data["response"] = execution_update.response

        if execution_update.error_message is not None:
            update_data["error_message"] = execution_update.error_message

        if execution_update.usage is not None:
            update_data["usage"] = execution_update.usage

        if execution_update.execution_metadata is not None:
            update_data["execution_metadata"] = execution_update.execution_metadata

        # Always update updated_at. Use an aware UTC timestamp instead of the
        # naive, deprecated datetime.utcnow(), so the stored value carries an
        # explicit +00:00 offset (consistent with the message endpoint).
        update_data["updated_at"] = datetime.now(timezone.utc).isoformat()

        # Update execution, scoped to the caller's organization.
        result = (
            client.table("executions")
            .update(update_data)
            .eq("id", execution_id)
            .eq("organization_id", organization["id"])
            .execute()
        )

        if not result.data:
            raise HTTPException(status_code=404, detail="Execution not found")

        execution_data = result.data[0]

        logger.info(
            "execution_updated",
            execution_id=execution_id,
            org_id=organization["id"],
            fields_updated=list(update_data.keys()),
        )

        return ExecutionResponse(
            id=execution_data["id"],
            organization_id=execution_data["organization_id"],
            execution_type=execution_data["execution_type"],
            entity_id=execution_data["entity_id"],
            entity_name=execution_data.get("entity_name"),
            # "or" fallbacks (not .get defaults): .get("x", {}) still returns
            # None when the column is present but SQL NULL, which would fail
            # validation of the non-optional str/dict fields on the model.
            prompt=execution_data.get("prompt") or "",
            system_prompt=execution_data.get("system_prompt"),
            status=execution_data["status"],
            response=execution_data.get("response"),
            error_message=execution_data.get("error_message"),
            usage=execution_data.get("usage") or {},
            execution_metadata=execution_data.get("execution_metadata") or {},
            runner_name=execution_data.get("runner_name"),
            user_id=execution_data.get("user_id"),
            user_name=execution_data.get("user_name"),
            user_email=execution_data.get("user_email"),
            user_avatar=execution_data.get("user_avatar"),
            created_at=execution_data["created_at"],
            started_at=execution_data.get("started_at"),
            completed_at=execution_data.get("completed_at"),
            updated_at=execution_data["updated_at"],
        )

    except HTTPException:
        raise
    except Exception as e:
        logger.error("execution_update_failed", error=str(e), execution_id=execution_id)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to update execution: {str(e)}"
        )
|
|
371
|
+
|
|
372
|
+
|
|
373
|
+
class SendMessageRequest(BaseModel):
    """Request to send a message to a running execution.

    The message is forwarded to the Temporal workflow as a chat message.
    """
    message: str  # message content forwarded to the workflow
    role: str = "user"  # user, system, etc.
|
|
377
|
+
|
|
378
|
+
|
|
379
|
+
@router.post("/{execution_id}/message", status_code=status.HTTP_202_ACCEPTED)
async def send_message_to_execution(
    execution_id: str,
    request_body: SendMessageRequest,
    request: Request,
    organization: dict = Depends(get_current_organization),
):
    """
    Send a followup message to a running execution using Temporal signals.

    This sends a signal to the Temporal workflow, adding the message to the conversation.
    The workflow will process the message and respond accordingly.

    Side effects, in order:
      1. Verifies the execution belongs to the caller's organization (404 otherwise).
      2. Signals the workflow (`team-execution-<id>` or `agent-execution-<id>`
         depending on the stored execution_type).
      3. Best-effort: records the sender as a participant, or refreshes
         last_active_at if already present. Failures here are logged and do
         not fail the request.

    Returns 202 with a small success payload; raises 500 on signal/query errors.
    """
    try:
        # Get Temporal client
        temporal_client = await get_temporal_client()

        # Verify the execution belongs to this organization and get execution type
        client = get_supabase()
        result = (
            client.table("executions")
            .select("id, organization_id, status, execution_type")
            .eq("id", execution_id)
            .eq("organization_id", organization["id"])
            .single()
            .execute()
        )

        if not result.data:
            raise HTTPException(status_code=404, detail="Execution not found")

        # Construct workflow ID based on execution type
        execution_type = result.data.get("execution_type", "AGENT")
        if execution_type == "TEAM":
            workflow_id = f"team-execution-{execution_id}"
        else:
            workflow_id = f"agent-execution-{execution_id}"

        workflow_handle = temporal_client.get_workflow_handle(workflow_id)

        # Import ChatMessage from workflow
        from control_plane_api.app.workflows.agent_execution import ChatMessage
        from datetime import datetime, timezone

        # Create the message with user attribution from JWT token
        message = ChatMessage(
            role=request_body.role,
            content=request_body.message,
            timestamp=datetime.now(timezone.utc).isoformat(),
            user_id=organization.get("user_id"),
            user_name=organization.get("user_name"),
            user_email=organization.get("user_email"),
            user_avatar=organization.get("user_avatar"),  # Now available from JWT via auth middleware
        )

        # Send signal to workflow.
        # NOTE(review): the AgentExecutionWorkflow.add_message signal is used
        # even when the workflow id is a team execution — confirm the team
        # workflow exposes a signal with the same name/signature.
        await workflow_handle.signal(AgentExecutionWorkflow.add_message, message)

        # Add user as participant if not already added (multiplayer support)
        user_id = organization.get("user_id")
        if user_id:
            try:
                # Check if participant already exists
                existing = (
                    client.table("execution_participants")
                    .select("id")
                    .eq("execution_id", execution_id)
                    .eq("user_id", user_id)
                    .execute()
                )

                if not existing.data or len(existing.data) == 0:
                    # Add as new participant (collaborator role)
                    import uuid
                    client.table("execution_participants").insert({
                        "id": str(uuid.uuid4()),
                        "execution_id": execution_id,
                        "organization_id": organization["id"],
                        "user_id": user_id,
                        "user_name": organization.get("user_name"),
                        "user_email": organization.get("user_email"),
                        "user_avatar": organization.get("user_avatar"),
                        "role": "collaborator",
                    }).execute()
                    logger.info(
                        "participant_added",
                        execution_id=execution_id,
                        user_id=user_id,
                    )
                else:
                    # Update last_active_at for existing participant
                    client.table("execution_participants").update({
                        "last_active_at": datetime.now(timezone.utc).isoformat(),
                    }).eq("execution_id", execution_id).eq("user_id", user_id).execute()
            except Exception as participant_error:
                logger.warning(
                    "failed_to_add_participant",
                    error=str(participant_error),
                    execution_id=execution_id,
                )
                # Don't fail the whole request if participant tracking fails

        logger.info(
            "message_sent_to_execution",
            execution_id=execution_id,
            org_id=organization["id"],
            role=request_body.role,
        )

        return {
            "success": True,
            "execution_id": execution_id,
            "message": "Message sent to workflow",
        }

    except HTTPException:
        raise
    except Exception as e:
        logger.error(
            "send_message_failed",
            error=str(e),
            execution_id=execution_id,
        )
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to send message: {str(e)}"
        )
|
|
506
|
+
|
|
507
|
+
|
|
508
|
+
@router.post("/{execution_id}/pause")
async def pause_execution(
    execution_id: str,
    request: Request,
    organization: dict = Depends(get_current_organization),
):
    """
    Pause an active execution by sending a signal to the Temporal workflow.

    This is triggered when the user clicks the PAUSE button in the UI.
    The workflow will stop processing but remain active and can be resumed.

    Returns a success payload with the workflow id, or a non-exception
    failure payload when the execution is not in a pausable state.
    Raises HTTPException 404 when the execution does not exist for this
    organization, and 500 on unexpected errors.
    """
    try:
        logger.info(
            "pause_execution_requested",
            execution_id=execution_id,
            org_id=organization["id"]
        )

        # Get execution from Supabase (scoped to the caller's organization)
        client = get_supabase()
        result = (
            client.table("executions")
            .select("id, status, execution_type")
            .eq("id", execution_id)
            .eq("organization_id", organization["id"])
            .single()
            .execute()
        )

        if not result.data:
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail="Execution not found"
            )

        execution = result.data
        current_status = execution["status"]

        # Only active executions can be paused
        if current_status not in ["running", "waiting_for_input"]:
            logger.warning(
                "pause_execution_invalid_status",
                execution_id=execution_id,
                status=current_status
            )
            return {
                "success": False,
                "error": f"Execution cannot be paused (status: {current_status})",
                "execution_id": execution_id,
                "status": current_status,
            }

        # Get Temporal client
        temporal_client = await get_temporal_client()

        # Determine workflow ID based on execution type
        execution_type = execution.get("execution_type", "AGENT")
        workflow_id = f"team-execution-{execution_id}" if execution_type == "TEAM" else f"agent-execution-{execution_id}"

        workflow_handle = temporal_client.get_workflow_handle(workflow_id)

        # Send pause signal to workflow
        await workflow_handle.signal(AgentExecutionWorkflow.pause_execution)

        # Update execution status to paused in Supabase.
        # FIX: use timezone-aware UTC (datetime.utcnow() is naive and
        # deprecated; the rest of this module uses datetime.now(timezone.utc)).
        (
            client.table("executions")
            .update({
                "status": "paused",
                "updated_at": datetime.now(timezone.utc).isoformat(),
            })
            .eq("id", execution_id)
            .eq("organization_id", organization["id"])
            .execute()
        )

        # Emit system message to Redis for UI display (best-effort: a Redis
        # failure must not fail the pause request)
        redis_client = get_redis_client()
        if redis_client:
            try:
                import time
                user_name = organization.get("user_name", "User")
                current_timestamp = datetime.now(timezone.utc).isoformat()
                message_id = f"{execution_id}_pause_{int(time.time() * 1000000)}"

                # Create message event - format matches what streaming endpoint expects
                pause_message_event = {
                    "event_type": "message",
                    "data": {
                        "role": "system",
                        "content": f"⏸️ Execution paused by {user_name}",
                        "timestamp": current_timestamp,
                        "message_id": message_id,
                    },
                    "timestamp": current_timestamp,
                    "execution_id": execution_id,
                }

                redis_key = f"execution:{execution_id}:events"
                await redis_client.lpush(redis_key, json.dumps(pause_message_event))
                await redis_client.ltrim(redis_key, 0, 999)
                await redis_client.expire(redis_key, 3600)

                # Also update status event
                status_event = {
                    "event_type": "status",
                    "data": {"status": "paused", "execution_id": execution_id},
                    "timestamp": current_timestamp,
                    "execution_id": execution_id,
                }
                await redis_client.lpush(redis_key, json.dumps(status_event))

                logger.debug("pause_event_published_to_redis", execution_id=execution_id)
            except Exception as redis_error:
                logger.warning("failed_to_publish_pause_event", error=str(redis_error), execution_id=execution_id)

        logger.info(
            "execution_paused_successfully",
            execution_id=execution_id,
            workflow_id=workflow_id
        )

        return {
            "success": True,
            "execution_id": execution_id,
            "workflow_id": workflow_id,
            "message": "Execution paused successfully",
        }

    except HTTPException:
        raise
    except Exception as e:
        logger.error(
            "pause_execution_error",
            execution_id=execution_id,
            error=str(e)
        )
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to pause execution: {str(e)}"
        )
|
|
650
|
+
|
|
651
|
+
|
|
652
|
+
@router.post("/{execution_id}/resume")
async def resume_execution(
    execution_id: str,
    request: Request,
    organization: dict = Depends(get_current_organization),
):
    """
    Resume a paused execution by sending a signal to the Temporal workflow.

    This is triggered when the user clicks the RESUME button in the UI.
    The workflow will continue processing from where it was paused.

    Returns a success payload with the workflow id, or a non-exception
    failure payload when the execution is not paused. Raises HTTPException
    404 when the execution does not exist for this organization, and 500
    on unexpected errors.
    """
    try:
        logger.info(
            "resume_execution_requested",
            execution_id=execution_id,
            org_id=organization["id"]
        )

        # Get execution from Supabase (scoped to the caller's organization)
        client = get_supabase()
        result = (
            client.table("executions")
            .select("id, status, execution_type")
            .eq("id", execution_id)
            .eq("organization_id", organization["id"])
            .single()
            .execute()
        )

        if not result.data:
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail="Execution not found"
            )

        execution = result.data
        current_status = execution["status"]

        # Only a paused execution can be resumed
        if current_status != "paused":
            logger.warning(
                "resume_execution_not_paused",
                execution_id=execution_id,
                status=current_status
            )
            return {
                "success": False,
                "error": f"Execution is not paused (status: {current_status})",
                "execution_id": execution_id,
                "status": current_status,
            }

        # Get Temporal client
        temporal_client = await get_temporal_client()

        # Determine workflow ID based on execution type
        execution_type = execution.get("execution_type", "AGENT")
        workflow_id = f"team-execution-{execution_id}" if execution_type == "TEAM" else f"agent-execution-{execution_id}"

        workflow_handle = temporal_client.get_workflow_handle(workflow_id)

        # Send resume signal to workflow
        await workflow_handle.signal(AgentExecutionWorkflow.resume_execution)

        # Update execution status back to running in Supabase; the workflow
        # will settle on the correct status (running vs waiting_for_input).
        # FIX: use timezone-aware UTC (datetime.utcnow() is naive and
        # deprecated; the rest of this module uses datetime.now(timezone.utc)).
        (
            client.table("executions")
            .update({
                "status": "running",  # Workflow will update to correct status
                "updated_at": datetime.now(timezone.utc).isoformat(),
            })
            .eq("id", execution_id)
            .eq("organization_id", organization["id"])
            .execute()
        )

        # Emit system message to Redis for UI display (best-effort: a Redis
        # failure must not fail the resume request)
        redis_client = get_redis_client()
        if redis_client:
            try:
                import time
                user_name = organization.get("user_name", "User")
                current_timestamp = datetime.now(timezone.utc).isoformat()
                message_id = f"{execution_id}_resume_{int(time.time() * 1000000)}"

                # Create message event - format matches what streaming endpoint expects
                resume_message_event = {
                    "event_type": "message",
                    "data": {
                        "role": "system",
                        "content": f"▶️ Execution resumed by {user_name}",
                        "timestamp": current_timestamp,
                        "message_id": message_id,
                    },
                    "timestamp": current_timestamp,
                    "execution_id": execution_id,
                }

                redis_key = f"execution:{execution_id}:events"
                await redis_client.lpush(redis_key, json.dumps(resume_message_event))
                await redis_client.ltrim(redis_key, 0, 999)
                await redis_client.expire(redis_key, 3600)

                # Also update status event
                status_event = {
                    "event_type": "status",
                    "data": {"status": "running", "execution_id": execution_id},
                    "timestamp": current_timestamp,
                    "execution_id": execution_id,
                }
                await redis_client.lpush(redis_key, json.dumps(status_event))

                logger.debug("resume_event_published_to_redis", execution_id=execution_id)
            except Exception as redis_error:
                logger.warning("failed_to_publish_resume_event", error=str(redis_error), execution_id=execution_id)

        logger.info(
            "execution_resumed_successfully",
            execution_id=execution_id,
            workflow_id=workflow_id
        )

        return {
            "success": True,
            "execution_id": execution_id,
            "workflow_id": workflow_id,
            "message": "Execution resumed successfully",
        }

    except HTTPException:
        raise
    except Exception as e:
        logger.error(
            "resume_execution_error",
            execution_id=execution_id,
            error=str(e)
        )
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to resume execution: {str(e)}"
        )
|
|
795
|
+
|
|
796
|
+
|
|
797
|
+
@router.post("/{execution_id}/cancel")
async def cancel_execution(
    execution_id: str,
    request: Request,
    organization: dict = Depends(get_current_organization),
):
    """
    Cancel an active execution by calling Temporal's workflow cancellation.

    This is triggered when the user clicks the STOP button in the UI.
    It uses Temporal's built-in cancellation which is fast and returns immediately.

    Even when the Temporal cancel call fails, the execution is still marked
    cancelled in the database (the user's intent takes precedence) and a
    success payload with a "warning" field is returned. Raises HTTPException
    404 when the execution does not exist for this organization, and 500
    on unexpected errors outside the cancel call.
    """
    try:
        from temporalio.client import WorkflowHandle

        logger.info(
            "cancel_execution_requested",
            execution_id=execution_id,
            org_id=organization["id"]
        )

        # Get execution from Supabase (scoped to the caller's organization)
        client = get_supabase()
        result = (
            client.table("executions")
            .select("id, status, execution_type")
            .eq("id", execution_id)
            .eq("organization_id", organization["id"])
            .single()
            .execute()
        )

        if not result.data:
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail="Execution not found"
            )

        execution = result.data
        current_status = execution["status"]

        # Check if execution is still running
        if current_status not in ["running", "waiting_for_input"]:
            logger.warning(
                "cancel_execution_not_running",
                execution_id=execution_id,
                status=current_status
            )
            return {
                "success": False,
                "error": f"Execution is not running (status: {current_status})",
                "execution_id": execution_id,
                "status": current_status,
            }

        # Get Temporal client
        temporal_client = await get_temporal_client()

        # Determine workflow ID based on execution type
        execution_type = execution.get("execution_type", "AGENT")
        workflow_id = f"team-execution-{execution_id}" if execution_type == "TEAM" else f"agent-execution-{execution_id}"

        workflow_handle: WorkflowHandle = temporal_client.get_workflow_handle(
            workflow_id=workflow_id
        )

        # Use Temporal's built-in workflow cancellation.
        # This is fast and returns immediately.
        try:
            # Send cancel signal to the workflow.
            # This returns immediately - it doesn't wait for the workflow to finish.
            await workflow_handle.cancel()

            # Update execution status to cancelled in Supabase.
            # FIX: use timezone-aware UTC (datetime.utcnow() is naive and
            # deprecated; the rest of this module uses datetime.now(timezone.utc)).
            cancelled_at = datetime.now(timezone.utc).isoformat()
            (
                client.table("executions")
                .update({
                    "status": "cancelled",
                    "completed_at": cancelled_at,
                    "error_message": "Cancelled by user",
                    "updated_at": cancelled_at,
                })
                .eq("id", execution_id)
                .eq("organization_id", organization["id"])
                .execute()
            )

            # Emit system message to Redis for UI display (best-effort)
            redis_client = get_redis_client()
            if redis_client:
                try:
                    import time
                    user_name = organization.get("user_name", "User")
                    current_timestamp = datetime.now(timezone.utc).isoformat()
                    message_id = f"{execution_id}_cancel_{int(time.time() * 1000000)}"

                    # Create message event - format matches what streaming endpoint expects
                    cancel_message_event = {
                        "event_type": "message",
                        "data": {
                            "role": "system",
                            "content": f"🛑 Execution stopped by {user_name}",
                            "timestamp": current_timestamp,
                            "message_id": message_id,
                        },
                        "timestamp": current_timestamp,
                        "execution_id": execution_id,
                    }

                    redis_key = f"execution:{execution_id}:events"
                    await redis_client.lpush(redis_key, json.dumps(cancel_message_event))
                    await redis_client.ltrim(redis_key, 0, 999)
                    await redis_client.expire(redis_key, 3600)

                    # Also update status event
                    status_event = {
                        "event_type": "status",
                        "data": {"status": "cancelled", "execution_id": execution_id},
                        "timestamp": current_timestamp,
                        "execution_id": execution_id,
                    }
                    await redis_client.lpush(redis_key, json.dumps(status_event))

                    logger.debug("cancel_event_published_to_redis", execution_id=execution_id)
                except Exception as redis_error:
                    logger.warning("failed_to_publish_cancel_event", error=str(redis_error), execution_id=execution_id)

            logger.info(
                "execution_cancelled_successfully",
                execution_id=execution_id,
                workflow_id=workflow_id
            )

            return {
                "success": True,
                "execution_id": execution_id,
                "workflow_id": workflow_id,
                "message": "Execution cancelled successfully",
            }

        except Exception as cancel_error:
            logger.error(
                "cancel_workflow_error",
                execution_id=execution_id,
                error=str(cancel_error)
            )

            # Mark as cancelled in database anyway (user intent matters)
            fallback_at = datetime.now(timezone.utc).isoformat()
            (
                client.table("executions")
                .update({
                    "status": "cancelled",
                    "completed_at": fallback_at,
                    "error_message": f"Cancelled: {str(cancel_error)}",
                    "updated_at": fallback_at,
                })
                .eq("id", execution_id)
                .eq("organization_id", organization["id"])
                .execute()
            )

            return {
                "success": True,  # User intent succeeded
                "execution_id": execution_id,
                "message": "Execution marked as cancelled",
                "warning": str(cancel_error),
            }

    except HTTPException:
        raise
    except Exception as e:
        logger.error(
            "cancel_execution_error",
            execution_id=execution_id,
            error=str(e)
        )
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to cancel execution: {str(e)}"
        )
|
|
977
|
+
|
|
978
|
+
|
|
979
|
+
@router.get("/{execution_id}/session")
async def get_session_history(
    execution_id: str,
    request: Request,
    organization: dict = Depends(get_current_organization),
):
    """
    Retrieve session history with Redis caching for hot loading.

    Workers GET session messages before each run to restore conversation context.

    Performance strategy:
    1. Check Redis cache first (hot path - milliseconds)
    2. Fall back to Supabase if not cached (cold path - ~50ms)
    3. Cache the result in Redis for next access
    """
    import json
    try:
        session_id = execution_id
        redis_key = f"session:{session_id}"

        # Hot path: serve straight from Redis when a cached copy exists.
        redis_client = get_redis_client()
        if redis_client:
            try:
                cached_session = await redis_client.get(redis_key)
                if cached_session:
                    payload = json.loads(cached_session)
                    logger.info(
                        "session_cache_hit",
                        execution_id=execution_id,
                        message_count=payload.get("message_count", 0)
                    )
                    return payload
            except Exception as redis_error:
                # Cache trouble is non-fatal; fall through to the database.
                logger.warning("session_cache_error", error=str(redis_error))

        # Cold path: Redis miss or unavailable - load the row from Supabase.
        supabase = get_supabase()
        result = (
            supabase.table("sessions")
            .select("*")
            .eq("execution_id", execution_id)
            .eq("organization_id", organization["id"])
            .single()
            .execute()
        )

        if not result.data:
            raise HTTPException(status_code=404, detail="Session not found")

        record = result.data
        messages = record.get("messages", [])

        payload = {
            "session_id": record.get("session_id", execution_id),
            "execution_id": execution_id,
            "messages": messages,
            "message_count": len(messages),
            "metadata": record.get("metadata", {}),
        }

        # Write-back: cache the freshly loaded session for the next reader.
        if redis_client:
            try:
                await redis_client.setex(
                    redis_key,
                    3600,  # 1 hour TTL
                    json.dumps(payload)
                )
                logger.info(
                    "session_cached",
                    execution_id=execution_id,
                    message_count=len(messages)
                )
            except Exception as cache_error:
                logger.warning("session_cache_write_error", error=str(cache_error))

        logger.info(
            "session_history_retrieved_from_supabase",
            execution_id=execution_id,
            session_id=record.get("session_id"),
            message_count=len(messages)
        )

        return payload

    except HTTPException:
        raise
    except Exception as e:
        logger.error(
            "failed_to_retrieve_session_history",
            execution_id=execution_id,
            error=str(e)
        )
        raise HTTPException(
            status_code=500,
            detail=f"Failed to retrieve session history: {str(e)}"
        )
|
|
1080
|
+
|
|
1081
|
+
|
|
1082
|
+
@router.post("/{execution_id}/session", status_code=status.HTTP_201_CREATED)
async def persist_session_history(
    execution_id: str,
    session_data: dict,
    request: Request,
    organization: dict = Depends(get_current_organization),
):
    """
    Persist session history from worker to Control Plane database.

    Worker POSTs session messages after each run completion.
    This ensures history is available even when worker is offline.

    Sessions are stored in Supabase for fast loading by the UI streaming endpoint.

    Never raises on failure: returns {"success": False, "error": ...} instead,
    so a persistence hiccup does not crash the worker's post-run flow.
    """
    import json
    try:
        client = get_supabase()

        session_id = session_data.get("session_id", execution_id)
        user_id = session_data.get("user_id")
        messages = session_data.get("messages", [])
        metadata = session_data.get("metadata", {})

        logger.info(
            "persisting_session_history",
            execution_id=execution_id,
            session_id=session_id,
            user_id=user_id,
            message_count=len(messages),
            org_id=organization["id"],
        )

        # Upsert to Supabase sessions table.
        # This matches what the streaming endpoint expects to load.
        session_record = {
            "execution_id": execution_id,
            "session_id": session_id,
            "organization_id": organization["id"],
            "user_id": user_id,
            "messages": messages,
            "metadata": metadata,
            # FIX: timezone-aware UTC (datetime.utcnow() is naive and
            # deprecated; the rest of this module uses datetime.now(timezone.utc)).
            "updated_at": datetime.now(timezone.utc).isoformat(),
        }

        result = (
            client.table("sessions")
            .upsert(session_record, on_conflict="execution_id")
            .execute()
        )

        if not result.data:
            logger.error(
                "session_upsert_failed",
                execution_id=execution_id,
                session_id=session_id
            )
            return {
                "success": False,
                "error": "Failed to upsert session to database"
            }

        logger.info(
            "session_persisted_to_supabase",
            execution_id=execution_id,
            session_id=session_id,
            message_count=len(messages),
        )

        # Cache in Redis for hot loading on next access (best-effort)
        redis_client = get_redis_client()
        if redis_client:
            try:
                redis_key = f"session:{session_id}"
                cache_data = {
                    "session_id": session_id,
                    "execution_id": execution_id,
                    "messages": messages,
                    "message_count": len(messages),
                }
                await redis_client.setex(
                    redis_key,
                    3600,  # 1 hour TTL
                    json.dumps(cache_data)
                )
                logger.info(
                    "session_cached_on_write",
                    execution_id=execution_id,
                    message_count=len(messages)
                )
            except Exception as cache_error:
                # Don't fail persistence if caching fails
                logger.warning("session_cache_write_error_on_persist", error=str(cache_error))

        return {
            "success": True,
            "execution_id": execution_id,
            "session_id": session_id,
            "persisted_messages": len(messages),
        }

    except Exception as e:
        logger.error(
            "session_persistence_failed",
            error=str(e),
            execution_id=execution_id,
        )
        return {
            "success": False,
            "error": str(e),
        }
|
|
1194
|
+
|
|
1195
|
+
|
|
1196
|
+
@router.post("/{execution_id}/mark-done", status_code=status.HTTP_202_ACCEPTED)
async def mark_execution_as_done(
    execution_id: str,
    request: Request,
    organization: dict = Depends(get_current_organization),
):
    """
    Mark an execution as done, signaling the workflow to complete.

    This sends a signal to the Temporal workflow to indicate the user is finished
    with the conversation. The workflow will complete gracefully after this signal.
    """
    try:
        # Get Temporal client
        temporal_client = await get_temporal_client()

        # Ownership check: the execution must belong to the caller's
        # organization; also fetch the execution type for workflow routing.
        supabase = get_supabase()
        lookup = (
            supabase.table("executions")
            .select("id, organization_id, status, execution_type")
            .eq("id", execution_id)
            .eq("organization_id", organization["id"])
            .single()
            .execute()
        )

        if not lookup.data:
            raise HTTPException(status_code=404, detail="Execution not found")

        # Workflow IDs are namespaced by execution type (team vs. single agent).
        exec_type = lookup.data.get("execution_type", "AGENT")
        prefix = "team-execution" if exec_type == "TEAM" else "agent-execution"
        workflow_id = f"{prefix}-{execution_id}"

        handle = temporal_client.get_workflow_handle(workflow_id)

        # Tell the workflow the user is done; it completes gracefully from here.
        await handle.signal(AgentExecutionWorkflow.mark_as_done)

        logger.info(
            "execution_marked_as_done",
            execution_id=execution_id,
            org_id=organization["id"],
        )

        return {
            "success": True,
            "execution_id": execution_id,
            "message": "Execution marked as done, workflow will complete",
        }

    except HTTPException:
        raise
    except Exception as e:
        logger.error(
            "mark_as_done_failed",
            error=str(e),
            execution_id=execution_id,
        )
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to mark execution as done: {str(e)}"
        )
|
|
1262
|
+
|
|
1263
|
+
|
|
1264
|
+
class StreamingEventRequest(BaseModel):
    """Request to publish a streaming event to Redis for real-time UI updates.

    Workers POST these to /{execution_id}/events; the event is pushed onto the
    execution's Redis event list and fanned out over pub/sub to SSE clients.
    """
    # Kind of event: "status", "message", "tool_started", "tool_completed", "error"
    event_type: str
    # Arbitrary event payload; shape depends on event_type
    data: dict
    # ISO-8601 timestamp; the endpoint fills in the current UTC time when omitted
    timestamp: str | None = None
|
|
1269
|
+
|
|
1270
|
+
|
|
1271
|
+
@router.post("/{execution_id}/events", status_code=status.HTTP_202_ACCEPTED)
async def publish_execution_event(
    execution_id: str,
    event: StreamingEventRequest,
    request: Request,
    organization: dict = Depends(get_current_organization),
):
    """
    Publish a streaming event to Redis for real-time UI updates.

    This endpoint is used by workers to send real-time events (tool execution, status updates, etc.)
    that are streamed to the UI via SSE without waiting for Temporal workflow completion.

    Events are stored in Redis list: execution:{execution_id}:events
    TTL: 1 hour (events are temporary, final state persists in database)

    Never raises on Redis failure: streaming is best-effort, so errors are
    reported in the response body rather than as HTTP errors.
    """
    try:
        redis_client = get_redis_client()
        if not redis_client:
            # Redis not configured - skip streaming but don't fail
            logger.warning("redis_not_configured_for_streaming", execution_id=execution_id)
            return {"success": True, "message": "Redis not configured, event skipped"}

        # Skip database verification for performance - authentication already validates organization
        # Streaming events are temporary (1hr TTL) and don't need strict validation
        # The worker is already authenticated via API key which validates organization

        # Build event payload.
        # FIX: default timestamp is timezone-aware UTC (datetime.utcnow() is
        # naive and deprecated; the rest of this module uses datetime.now(timezone.utc)).
        event_data = {
            "event_type": event.event_type,
            "data": event.data,
            "timestamp": event.timestamp or datetime.now(timezone.utc).isoformat(),
            "execution_id": execution_id,
        }

        # Push event to Redis list (most recent at head) - this must be FAST
        redis_key = f"execution:{execution_id}:events"
        await redis_client.lpush(redis_key, json.dumps(event_data))

        # Keep only last 1000 events (prevent memory issues)
        await redis_client.ltrim(redis_key, 0, 999)

        # Set TTL: 1 hour (events are temporary)
        await redis_client.expire(redis_key, 3600)

        # Also publish to pub/sub channel for real-time streaming.
        # This allows connected SSE clients to receive updates instantly.
        pubsub_channel = f"execution:{execution_id}:stream"
        try:
            await redis_client.publish(pubsub_channel, json.dumps(event_data))
        except Exception as pubsub_error:
            # Don't fail if pub/sub fails - the list storage is the primary mechanism
            logger.debug("pubsub_publish_failed", error=str(pubsub_error), execution_id=execution_id[:8])

        logger.info(
            "execution_event_published",
            execution_id=execution_id[:8],
            event_type=event.event_type,
        )

        return {
            "success": True,
            "execution_id": execution_id,
            "event_type": event.event_type,
        }

    except HTTPException:
        raise
    except Exception as e:
        logger.error(
            "publish_event_failed",
            error=str(e),
            execution_id=execution_id,
            event_type=event.event_type,
        )
        # Don't fail the worker if streaming fails - it's not critical
        return {
            "success": False,
            "error": str(e),
            "message": "Event publishing failed but execution continues"
        }
|
|
1352
|
+
|
|
1353
|
+
|
|
1354
|
+
@router.get("/{execution_id}/stream")
async def stream_execution(
    execution_id: str,
    request: Request,
    organization: dict = Depends(get_current_organization),
):
    """
    Stream execution updates using Server-Sent Events (SSE).

    This endpoint combines two sources for real-time updates:
    1. Redis streaming events (from worker activities) - sub-second latency
    2. Temporal workflow queries (for state consistency) - 200ms polling

    The Redis events provide instant tool execution updates, while Temporal
    ensures we never miss state changes even if Redis is unavailable.

    SSE format:
    - data: {json object with execution status, messages, tool calls}
    - event: status|message|tool_started|tool_completed|error|done

    Args:
        execution_id: ID of the execution to stream.
        request: Incoming request; used to detect client disconnects.
        organization: Authenticated organization (scopes the DB lookup).

    Returns:
        StreamingResponse emitting ``text/event-stream`` data until the
        execution reaches a terminal state or the client disconnects.
    """

    async def generate_sse():
        """Generate Server-Sent Events from Agno session and Temporal workflow state."""
        import time
        start_time = time.time()

        try:
            # Get Temporal client (timed for observability)
            t0 = time.time()
            temporal_client = await get_temporal_client()
            logger.info("timing_temporal_client", duration_ms=int((time.time() - t0) * 1000), execution_id=execution_id)

            # Check Redis cache first for execution_type (fast, sub-millisecond)
            execution_type = None
            redis_client = get_redis_client()

            if redis_client:
                try:
                    t0 = time.time()
                    # Check if we have metadata event in Redis
                    redis_key = f"execution:{execution_id}:events"
                    redis_events = await redis_client.lrange(redis_key, 0, -1)

                    # Look for metadata event with execution_type
                    if redis_events:
                        for event_json in redis_events:
                            try:
                                event_data = json.loads(event_json)
                                if event_data.get("event_type") == "metadata" and event_data.get("data", {}).get("execution_type"):
                                    execution_type = event_data["data"]["execution_type"]
                                    logger.info("timing_redis_cache_hit", duration_ms=int((time.time() - t0) * 1000), execution_id=execution_id, execution_type=execution_type)
                                    break
                            except json.JSONDecodeError:
                                # Skip malformed events; others may still carry metadata
                                continue
                except Exception as redis_error:
                    # Cache lookup is best-effort; DB fallback below covers this
                    logger.warning("redis_cache_lookup_failed", error=str(redis_error), execution_id=execution_id)

            # Fall back to database if not in cache
            if not execution_type:
                t0 = time.time()
                client = get_supabase()
                exec_result = (
                    client.table("executions")
                    .select("id, execution_type")
                    .eq("id", execution_id)
                    .eq("organization_id", organization["id"])
                    .single()
                    .execute()
                )
                logger.info("timing_db_query_fallback", duration_ms=int((time.time() - t0) * 1000), execution_id=execution_id)

                if not exec_result.data:
                    raise HTTPException(status_code=404, detail="Execution not found")

                execution_type = exec_result.data.get("execution_type", "AGENT")

            # Construct workflow ID based on execution type
            # Team executions use "team-execution-{id}", agent executions use "agent-execution-{id}"
            if execution_type == "TEAM":
                workflow_id = f"team-execution-{execution_id}"
            else:
                workflow_id = f"agent-execution-{execution_id}"

            workflow_handle = temporal_client.get_workflow_handle(workflow_id)

            logger.info(
                "execution_stream_connecting",
                execution_id=execution_id,
                execution_type=execution_type,
                workflow_id=workflow_id,
            )

            # Per-connection streaming state, mutated across poll iterations
            last_status = None
            last_message_count = 0
            last_keepalive = asyncio.get_event_loop().time()
            last_redis_event_index = -1  # Track which Redis events we've sent
            consecutive_failures = 0  # Track consecutive workflow query failures
            worker_down_mode = False  # Track if we're in worker-down fallback mode
            last_db_poll = 0  # Track last database poll time

            # Check if worker is ACTIVELY processing by checking Temporal workflow execution status
            # This is much more performant than querying workflow state - it's just a metadata lookup
            # We only stream from Redis if workflow is RUNNING at Temporal level (worker is active)
            # Otherwise, we load from database (workflow completed/failed/no active worker)
            is_workflow_running = False
            try:
                t0 = time.time()
                description = await workflow_handle.describe()
                # Temporal execution status: RUNNING, COMPLETED, FAILED, CANCELLED, TERMINATED, TIMED_OUT, CONTINUED_AS_NEW
                # Use .name to get just the enum name (e.g., "RUNNING") without the class prefix
                temporal_status_name = description.status.name
                is_workflow_running = temporal_status_name == "RUNNING"
                logger.info(
                    "initial_workflow_status",
                    execution_id=execution_id,
                    temporal_status=temporal_status_name,
                    temporal_status_full=str(description.status),
                    is_running=is_workflow_running,
                    duration_ms=int((time.time() - t0) * 1000)
                )
            except Exception as describe_error:
                # If we can't describe workflow, assume it's not running
                logger.warning("initial_workflow_describe_failed", execution_id=execution_id, error=str(describe_error))
                is_workflow_running = False

            # ALWAYS load historical messages from database first
            # This ensures UI sees conversation history even when connecting mid-execution
            # In streaming mode, we'll then continue with Redis for real-time updates
            t0 = time.time()
            try:
                # Read session from Control Plane database (where worker persists)
                client = get_supabase()
                session_result = (
                    client.table("sessions")
                    .select("messages")
                    .eq("execution_id", execution_id)
                    .order("updated_at", desc=True)
                    .limit(1)
                    .execute()
                )

                session_messages = []
                if session_result.data and len(session_result.data) > 0:
                    messages_data = session_result.data[0].get("messages", [])
                    # Convert dict messages to objects with attributes
                    from dataclasses import dataclass, field
                    from typing import Optional as Opt

                    @dataclass
                    class SessionMessage:
                        # role: message author role (e.g. "user"/"assistant"/"system")
                        role: str
                        # content: message text as persisted in the session row
                        content: str
                        # timestamp: ISO-format string from the database, if present
                        timestamp: Opt[str] = None
                        # user_*: optional attribution fields for user-authored messages
                        user_id: Opt[str] = None
                        user_name: Opt[str] = None
                        user_email: Opt[str] = None
                        user_avatar: Opt[str] = None

                    session_messages = [SessionMessage(**msg) for msg in messages_data]

                if session_messages:
                    logger.info(
                        "sending_session_history_on_connect",
                        execution_id=execution_id,
                        message_count=len(session_messages)
                    )

                    # Send all existing messages immediately
                    for msg in session_messages:
                        msg_data = {
                            "role": msg.role,
                            "content": msg.content,
                            "timestamp": msg.timestamp,  # Already in ISO format from database
                        }
                        # Include user attribution if available
                        if msg.user_id:
                            msg_data["user_id"] = msg.user_id
                            msg_data["user_name"] = msg.user_name
                            msg_data["user_email"] = msg.user_email
                            msg_data["user_avatar"] = msg.user_avatar
                        yield f"event: message\n"
                        yield f"data: {json.dumps(msg_data)}\n\n"

                    last_message_count = len(session_messages)

                logger.info("timing_session_history_load", duration_ms=int((time.time() - t0) * 1000), execution_id=execution_id, message_count=last_message_count)

            except Exception as session_error:
                logger.warning(
                    "failed_to_load_session_history",
                    execution_id=execution_id,
                    error=str(session_error),
                    duration_ms=int((time.time() - t0) * 1000)
                )
                # Continue even if session loading fails - workflow state will still work

            # Main polling loop: runs until terminal state or client disconnect
            while True:
                # Check if client disconnected
                if await request.is_disconnected():
                    logger.info("execution_stream_disconnected", execution_id=execution_id)
                    break

                # Send keepalive comment every 15 seconds to prevent timeout
                current_time = asyncio.get_event_loop().time()
                if current_time - last_keepalive > 15:
                    yield ": keepalive\n\n"
                    last_keepalive = current_time

                # FIRST: Check Redis for NEW real-time streaming events (sub-second latency)
                # ONLY if workflow is actively running (worker is connected and processing)
                # We track which events we've sent to avoid re-sending
                if is_workflow_running and redis_client:
                    try:
                        redis_key = f"execution:{execution_id}:events"
                        # Get the total count of events in Redis
                        total_events = await redis_client.llen(redis_key)

                        if total_events and total_events > (last_redis_event_index + 1):
                            # There are new events we haven't sent yet
                            logger.debug(
                                "redis_new_events_found",
                                execution_id=execution_id,
                                total=total_events,
                                last_index=last_redis_event_index
                            )

                            # Get all events (they're in reverse chronological order from LPUSH)
                            all_redis_events = await redis_client.lrange(redis_key, 0, -1)

                            if all_redis_events:
                                # Reverse to get chronological order (oldest first)
                                chronological_events = list(reversed(all_redis_events))

                                # Send only NEW events we haven't sent yet
                                for i in range(last_redis_event_index + 1, len(chronological_events)):
                                    event_json = chronological_events[i]

                                    try:
                                        event_data = json.loads(event_json)
                                        event_type = event_data.get("event_type", "message")

                                        # For message events with wrapped data (pause/resume/cancel system messages),
                                        # extract just the message data. For other events, send as-is.
                                        if event_type == "message" and "data" in event_data and isinstance(event_data["data"], dict) and "role" in event_data["data"]:
                                            # This is a new-style system message with role/content in data field
                                            payload = event_data["data"]
                                        else:
                                            # This is an existing event format - send the whole event_data
                                            payload = event_data

                                        # Stream the event to UI
                                        yield f"event: {event_type}\n"
                                        yield f"data: {json.dumps(payload)}\n\n"

                                        last_redis_event_index = i

                                        logger.debug(
                                            "redis_event_streamed",
                                            execution_id=execution_id,
                                            event_type=event_type,
                                            index=i
                                        )

                                    except json.JSONDecodeError:
                                        logger.warning("invalid_redis_event_json", event=event_json[:100])
                                        continue

                    except Exception as redis_error:
                        logger.error("redis_event_read_failed", error=str(redis_error), execution_id=execution_id)
                        # Continue with Temporal polling even if Redis fails

                try:
                    # SECOND: Check Temporal workflow execution status (lightweight metadata lookup)
                    t0 = time.time()
                    description = await workflow_handle.describe()
                    temporal_status = description.status.name  # Get enum name (e.g., "RUNNING")
                    describe_duration = int((time.time() - t0) * 1000)

                    # Log slow describe calls (>100ms)
                    if describe_duration > 100:
                        logger.warning("slow_temporal_describe", duration_ms=describe_duration, execution_id=execution_id)

                    # Update is_workflow_running based on Temporal execution status
                    # Only stream from Redis when workflow is actively being processed by a worker
                    previous_running_state = is_workflow_running
                    is_workflow_running = temporal_status == "RUNNING"

                    # Log when streaming mode changes
                    if previous_running_state != is_workflow_running:
                        logger.info(
                            "streaming_mode_changed",
                            execution_id=execution_id,
                            temporal_status=temporal_status,
                            is_workflow_running=is_workflow_running,
                            mode="redis_streaming" if is_workflow_running else "database_only"
                        )

                    # If workflow finished, send appropriate event and exit
                    if temporal_status in ["COMPLETED", "FAILED", "TERMINATED", "CANCELLED"]:
                        # Query workflow state one final time to get the complete results
                        try:
                            state = await workflow_handle.query(AgentExecutionWorkflow.get_state)

                            if temporal_status in ["COMPLETED", "TERMINATED"]:
                                done_data = {
                                    "execution_id": execution_id,
                                    "status": "completed",
                                    "response": state.current_response,
                                    "usage": state.usage,
                                    "metadata": state.metadata,
                                }
                                yield f"event: done\n"
                                yield f"data: {json.dumps(done_data)}\n\n"
                            else:  # FAILED or CANCELLED
                                error_data = {
                                    "error": state.error_message or f"Workflow {temporal_status.lower()}",
                                    "execution_id": execution_id,
                                    "status": "failed",
                                }
                                if state.metadata.get("error_type"):
                                    error_data["error_type"] = state.metadata["error_type"]
                                yield f"event: error\n"
                                yield f"data: {json.dumps(error_data)}\n\n"
                        except Exception as final_query_error:
                            # If we can't query for final state, fall back to database
                            logger.warning("final_state_query_failed", execution_id=execution_id, error=str(final_query_error))

                            # Try to get final status from database
                            # NOTE(review): `client` is bound in earlier try-blocks; presumed
                            # always set by this point — confirm if get_supabase() can raise.
                            try:
                                exec_result = (
                                    client.table("executions")
                                    .select("status, response, error_message, usage, execution_metadata")
                                    .eq("id", execution_id)
                                    .single()
                                    .execute()
                                )

                                if exec_result.data:
                                    if temporal_status in ["COMPLETED", "TERMINATED"]:
                                        done_data = {
                                            "execution_id": execution_id,
                                            "status": exec_result.data.get("status", "completed"),
                                            "response": exec_result.data.get("response"),
                                            "usage": exec_result.data.get("usage", {}),
                                            "metadata": exec_result.data.get("execution_metadata", {}),
                                        }
                                        yield f"event: done\n"
                                        yield f"data: {json.dumps(done_data)}\n\n"
                                    else:
                                        error_data = {
                                            "error": exec_result.data.get("error_message") or f"Workflow {temporal_status.lower()}",
                                            "execution_id": execution_id,
                                            "status": exec_result.data.get("status", "failed"),
                                        }
                                        yield f"event: error\n"
                                        yield f"data: {json.dumps(error_data)}\n\n"
                                else:
                                    yield f"event: done\n"
                                    yield f"data: {json.dumps({'execution_id': execution_id, 'workflow_status': temporal_status})}\n\n"
                            except Exception as db_error:
                                logger.error("database_fallback_failed", execution_id=execution_id, error=str(db_error))
                                yield f"event: done\n"
                                yield f"data: {json.dumps({'execution_id': execution_id, 'workflow_status': temporal_status})}\n\n"
                        # Terminal Temporal status reached: end the stream
                        break

                    # THIRD: Query workflow state for application-level details (messages, usage, etc.)
                    # Only do this if workflow is still running to get incremental updates
                    try:
                        state = await workflow_handle.query(AgentExecutionWorkflow.get_state)

                        # Reset failure counter on successful query
                        if consecutive_failures > 0:
                            logger.info(
                                "workflow_query_recovered",
                                execution_id=execution_id,
                                failures=consecutive_failures
                            )
                            consecutive_failures = 0
                            worker_down_mode = False

                        # Send status update if changed
                        if state.status != last_status:
                            yield f"event: status\n"
                            yield f"data: {json.dumps({'status': state.status, 'execution_id': execution_id})}\n\n"
                            last_status = state.status

                            logger.info(
                                "execution_status_update",
                                execution_id=execution_id,
                                status=state.status
                            )

                        # Send new messages incrementally
                        # Skip assistant messages - they're already streamed via message_chunk events
                        if len(state.messages) > last_message_count:
                            new_messages = state.messages[last_message_count:]
                            for msg in new_messages:
                                # Skip assistant messages to prevent duplicates with chunk streaming
                                if msg.role == "assistant":
                                    continue

                                msg_data = {
                                    "role": msg.role,
                                    "content": msg.content,
                                    "timestamp": msg.timestamp,
                                }
                                if msg.tool_name:
                                    msg_data["tool_name"] = msg.tool_name
                                    msg_data["tool_input"] = msg.tool_input
                                    msg_data["tool_output"] = msg.tool_output
                                # Include user attribution for messages
                                if hasattr(msg, 'user_id') and msg.user_id:
                                    msg_data["user_id"] = msg.user_id
                                    msg_data["user_name"] = msg.user_name
                                    msg_data["user_email"] = msg.user_email
                                    msg_data["user_avatar"] = msg.user_avatar

                                yield f"event: message\n"
                                yield f"data: {json.dumps(msg_data)}\n\n"

                            last_message_count = len(state.messages)

                    except Exception as query_error:
                        # Workflow query failed - track failures and switch to database fallback
                        consecutive_failures += 1
                        error_msg = str(query_error)

                        # Detect worker down condition
                        # NOTE(review): `is_worker_down` is computed but not read below —
                        # presumably intended to gate the mode switch; verify.
                        is_worker_down = "no poller seen" in error_msg or "workflow not found" in error_msg

                        if consecutive_failures >= 3 and not worker_down_mode:
                            worker_down_mode = True
                            logger.warning(
                                "worker_down_detected_switching_to_database_mode",
                                execution_id=execution_id,
                                failures=consecutive_failures,
                                error=error_msg
                            )

                        # In worker down mode, poll database for updates
                        if worker_down_mode:
                            current_time = time.time()
                            # Poll database every 2 seconds when worker is down
                            if current_time - last_db_poll >= 2.0:
                                try:
                                    # Check execution status from database
                                    exec_result = (
                                        client.table("executions")
                                        .select("status, response, error_message")
                                        .eq("id", execution_id)
                                        .single()
                                        .execute()
                                    )

                                    if exec_result.data:
                                        db_status = exec_result.data.get("status")

                                        # Send status update if changed
                                        if db_status and db_status != last_status:
                                            yield f"event: status\n"
                                            yield f"data: {json.dumps({'status': db_status, 'execution_id': execution_id, 'source': 'database'})}\n\n"
                                            last_status = db_status

                                            logger.info(
                                                "database_status_update",
                                                execution_id=execution_id,
                                                status=db_status
                                            )

                                        # Check if execution finished
                                        if db_status in ["completed", "failed", "cancelled"]:
                                            if db_status == "completed":
                                                done_data = {
                                                    "execution_id": execution_id,
                                                    "status": db_status,
                                                    "response": exec_result.data.get("response"),
                                                }
                                                yield f"event: done\n"
                                                yield f"data: {json.dumps(done_data)}\n\n"
                                            else:
                                                error_data = {
                                                    "error": exec_result.data.get("error_message") or f"Execution {db_status}",
                                                    "execution_id": execution_id,
                                                    "status": db_status,
                                                }
                                                yield f"event: error\n"
                                                yield f"data: {json.dumps(error_data)}\n\n"
                                            break

                                        # Check for new session messages
                                        session_result = (
                                            client.table("sessions")
                                            .select("messages")
                                            .eq("execution_id", execution_id)
                                            .single()
                                            .execute()
                                        )

                                        if session_result.data:
                                            db_messages = session_result.data.get("messages", [])
                                            if len(db_messages) > last_message_count:
                                                new_messages = db_messages[last_message_count:]
                                                for msg_dict in new_messages:
                                                    yield f"event: message\n"
                                                    yield f"data: {json.dumps(msg_dict)}\n\n"
                                                last_message_count = len(db_messages)

                                                logger.info(
                                                    "database_messages_update",
                                                    execution_id=execution_id,
                                                    new_messages=len(new_messages)
                                                )

                                    last_db_poll = current_time

                                except Exception as db_poll_error:
                                    logger.error(
                                        "database_poll_failed",
                                        execution_id=execution_id,
                                        error=str(db_poll_error)
                                    )
                        else:
                            # Still trying to connect to worker - log but don't switch modes yet
                            logger.debug(
                                "workflow_query_failed",
                                execution_id=execution_id,
                                failures=consecutive_failures,
                                error=error_msg
                            )

                    # Poll every 200ms for real-time updates when worker is up
                    # Poll every 500ms when in worker down mode (database polling)
                    await asyncio.sleep(0.5 if worker_down_mode else 0.2)

                except Exception as error:
                    # Critical error (e.g., workflow describe failed)
                    logger.error(
                        "critical_streaming_error",
                        execution_id=execution_id,
                        error=str(error)
                    )
                    # Back off and retry
                    await asyncio.sleep(1.0)

        except Exception as e:
            # Last-resort handler: surface the error to the client as an SSE event
            logger.error("execution_stream_error", error=str(e), execution_id=execution_id)
            yield f"event: error\n"
            yield f"data: {json.dumps({'error': str(e)})}\n\n"

    return StreamingResponse(
        generate_sse(),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "X-Accel-Buffering": "no",  # Disable nginx buffering
        }
    )
|