kubiya-control-plane-api 0.1.0__py3-none-any.whl → 0.3.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of kubiya-control-plane-api might be problematic.

Files changed (185)
  1. control_plane_api/README.md +266 -0
  2. control_plane_api/__init__.py +0 -0
  3. control_plane_api/__version__.py +1 -0
  4. control_plane_api/alembic/README +1 -0
  5. control_plane_api/alembic/env.py +98 -0
  6. control_plane_api/alembic/script.py.mako +28 -0
  7. control_plane_api/alembic/versions/1382bec74309_initial_migration_with_all_models.py +251 -0
  8. control_plane_api/alembic/versions/1f54bc2a37e3_add_analytics_tables.py +162 -0
  9. control_plane_api/alembic/versions/2e4cb136dc10_rename_toolset_ids_to_skill_ids_in_teams.py +30 -0
  10. control_plane_api/alembic/versions/31cd69a644ce_add_skill_templates_table.py +28 -0
  11. control_plane_api/alembic/versions/89e127caa47d_add_jobs_and_job_executions_tables.py +161 -0
  12. control_plane_api/alembic/versions/add_llm_models_table.py +51 -0
  13. control_plane_api/alembic/versions/b0e10697f212_add_runtime_column_to_teams_simple.py +42 -0
  14. control_plane_api/alembic/versions/ce43b24b63bf_add_execution_trigger_source_and_fix_.py +155 -0
  15. control_plane_api/alembic/versions/d4eaf16e3f8d_rename_toolsets_to_skills.py +84 -0
  16. control_plane_api/alembic/versions/efa2dc427da1_rename_metadata_to_custom_metadata.py +32 -0
  17. control_plane_api/alembic/versions/f973b431d1ce_add_workflow_executor_to_skill_types.py +44 -0
  18. control_plane_api/alembic.ini +148 -0
  19. control_plane_api/api/index.py +12 -0
  20. control_plane_api/app/__init__.py +11 -0
  21. control_plane_api/app/activities/__init__.py +20 -0
  22. control_plane_api/app/activities/agent_activities.py +379 -0
  23. control_plane_api/app/activities/team_activities.py +410 -0
  24. control_plane_api/app/activities/temporal_cloud_activities.py +577 -0
  25. control_plane_api/app/config/__init__.py +35 -0
  26. control_plane_api/app/config/api_config.py +354 -0
  27. control_plane_api/app/config/model_pricing.py +318 -0
  28. control_plane_api/app/config.py +95 -0
  29. control_plane_api/app/database.py +135 -0
  30. control_plane_api/app/exceptions.py +408 -0
  31. control_plane_api/app/lib/__init__.py +11 -0
  32. control_plane_api/app/lib/job_executor.py +312 -0
  33. control_plane_api/app/lib/kubiya_client.py +235 -0
  34. control_plane_api/app/lib/litellm_pricing.py +166 -0
  35. control_plane_api/app/lib/planning_tools/__init__.py +22 -0
  36. control_plane_api/app/lib/planning_tools/agents.py +155 -0
  37. control_plane_api/app/lib/planning_tools/base.py +189 -0
  38. control_plane_api/app/lib/planning_tools/environments.py +214 -0
  39. control_plane_api/app/lib/planning_tools/resources.py +240 -0
  40. control_plane_api/app/lib/planning_tools/teams.py +198 -0
  41. control_plane_api/app/lib/policy_enforcer_client.py +939 -0
  42. control_plane_api/app/lib/redis_client.py +436 -0
  43. control_plane_api/app/lib/supabase.py +71 -0
  44. control_plane_api/app/lib/temporal_client.py +138 -0
  45. control_plane_api/app/lib/validation/__init__.py +20 -0
  46. control_plane_api/app/lib/validation/runtime_validation.py +287 -0
  47. control_plane_api/app/main.py +128 -0
  48. control_plane_api/app/middleware/__init__.py +8 -0
  49. control_plane_api/app/middleware/auth.py +513 -0
  50. control_plane_api/app/middleware/exception_handler.py +267 -0
  51. control_plane_api/app/middleware/rate_limiting.py +384 -0
  52. control_plane_api/app/middleware/request_id.py +202 -0
  53. control_plane_api/app/models/__init__.py +27 -0
  54. control_plane_api/app/models/agent.py +79 -0
  55. control_plane_api/app/models/analytics.py +206 -0
  56. control_plane_api/app/models/associations.py +81 -0
  57. control_plane_api/app/models/environment.py +63 -0
  58. control_plane_api/app/models/execution.py +93 -0
  59. control_plane_api/app/models/job.py +179 -0
  60. control_plane_api/app/models/llm_model.py +75 -0
  61. control_plane_api/app/models/presence.py +49 -0
  62. control_plane_api/app/models/project.py +47 -0
  63. control_plane_api/app/models/session.py +38 -0
  64. control_plane_api/app/models/team.py +66 -0
  65. control_plane_api/app/models/workflow.py +55 -0
  66. control_plane_api/app/policies/README.md +121 -0
  67. control_plane_api/app/policies/approved_users.rego +62 -0
  68. control_plane_api/app/policies/business_hours.rego +51 -0
  69. control_plane_api/app/policies/rate_limiting.rego +100 -0
  70. control_plane_api/app/policies/tool_restrictions.rego +86 -0
  71. control_plane_api/app/routers/__init__.py +4 -0
  72. control_plane_api/app/routers/agents.py +364 -0
  73. control_plane_api/app/routers/agents_v2.py +1260 -0
  74. control_plane_api/app/routers/analytics.py +1014 -0
  75. control_plane_api/app/routers/context_manager.py +562 -0
  76. control_plane_api/app/routers/environment_context.py +270 -0
  77. control_plane_api/app/routers/environments.py +715 -0
  78. control_plane_api/app/routers/execution_environment.py +517 -0
  79. control_plane_api/app/routers/executions.py +1911 -0
  80. control_plane_api/app/routers/health.py +92 -0
  81. control_plane_api/app/routers/health_v2.py +326 -0
  82. control_plane_api/app/routers/integrations.py +274 -0
  83. control_plane_api/app/routers/jobs.py +1344 -0
  84. control_plane_api/app/routers/models.py +82 -0
  85. control_plane_api/app/routers/models_v2.py +361 -0
  86. control_plane_api/app/routers/policies.py +639 -0
  87. control_plane_api/app/routers/presence.py +234 -0
  88. control_plane_api/app/routers/projects.py +902 -0
  89. control_plane_api/app/routers/runners.py +379 -0
  90. control_plane_api/app/routers/runtimes.py +172 -0
  91. control_plane_api/app/routers/secrets.py +155 -0
  92. control_plane_api/app/routers/skills.py +1001 -0
  93. control_plane_api/app/routers/skills_definitions.py +140 -0
  94. control_plane_api/app/routers/task_planning.py +1256 -0
  95. control_plane_api/app/routers/task_queues.py +654 -0
  96. control_plane_api/app/routers/team_context.py +270 -0
  97. control_plane_api/app/routers/teams.py +1400 -0
  98. control_plane_api/app/routers/worker_queues.py +1545 -0
  99. control_plane_api/app/routers/workers.py +935 -0
  100. control_plane_api/app/routers/workflows.py +204 -0
  101. control_plane_api/app/runtimes/__init__.py +6 -0
  102. control_plane_api/app/runtimes/validation.py +344 -0
  103. control_plane_api/app/schemas/job_schemas.py +295 -0
  104. control_plane_api/app/services/__init__.py +1 -0
  105. control_plane_api/app/services/agno_service.py +619 -0
  106. control_plane_api/app/services/litellm_service.py +190 -0
  107. control_plane_api/app/services/policy_service.py +525 -0
  108. control_plane_api/app/services/temporal_cloud_provisioning.py +150 -0
  109. control_plane_api/app/skills/__init__.py +44 -0
  110. control_plane_api/app/skills/base.py +229 -0
  111. control_plane_api/app/skills/business_intelligence.py +189 -0
  112. control_plane_api/app/skills/data_visualization.py +154 -0
  113. control_plane_api/app/skills/docker.py +104 -0
  114. control_plane_api/app/skills/file_generation.py +94 -0
  115. control_plane_api/app/skills/file_system.py +110 -0
  116. control_plane_api/app/skills/python.py +92 -0
  117. control_plane_api/app/skills/registry.py +65 -0
  118. control_plane_api/app/skills/shell.py +102 -0
  119. control_plane_api/app/skills/workflow_executor.py +469 -0
  120. control_plane_api/app/utils/workflow_executor.py +354 -0
  121. control_plane_api/app/workflows/__init__.py +11 -0
  122. control_plane_api/app/workflows/agent_execution.py +507 -0
  123. control_plane_api/app/workflows/agent_execution_with_skills.py +222 -0
  124. control_plane_api/app/workflows/namespace_provisioning.py +326 -0
  125. control_plane_api/app/workflows/team_execution.py +399 -0
  126. control_plane_api/scripts/seed_models.py +239 -0
  127. control_plane_api/worker/__init__.py +0 -0
  128. control_plane_api/worker/activities/__init__.py +0 -0
  129. control_plane_api/worker/activities/agent_activities.py +1241 -0
  130. control_plane_api/worker/activities/approval_activities.py +234 -0
  131. control_plane_api/worker/activities/runtime_activities.py +388 -0
  132. control_plane_api/worker/activities/skill_activities.py +267 -0
  133. control_plane_api/worker/activities/team_activities.py +1217 -0
  134. control_plane_api/worker/config/__init__.py +31 -0
  135. control_plane_api/worker/config/worker_config.py +275 -0
  136. control_plane_api/worker/control_plane_client.py +529 -0
  137. control_plane_api/worker/examples/analytics_integration_example.py +362 -0
  138. control_plane_api/worker/models/__init__.py +1 -0
  139. control_plane_api/worker/models/inputs.py +89 -0
  140. control_plane_api/worker/runtimes/__init__.py +31 -0
  141. control_plane_api/worker/runtimes/base.py +789 -0
  142. control_plane_api/worker/runtimes/claude_code_runtime.py +1443 -0
  143. control_plane_api/worker/runtimes/default_runtime.py +617 -0
  144. control_plane_api/worker/runtimes/factory.py +173 -0
  145. control_plane_api/worker/runtimes/validation.py +93 -0
  146. control_plane_api/worker/services/__init__.py +1 -0
  147. control_plane_api/worker/services/agent_executor.py +422 -0
  148. control_plane_api/worker/services/agent_executor_v2.py +383 -0
  149. control_plane_api/worker/services/analytics_collector.py +457 -0
  150. control_plane_api/worker/services/analytics_service.py +464 -0
  151. control_plane_api/worker/services/approval_tools.py +310 -0
  152. control_plane_api/worker/services/approval_tools_agno.py +207 -0
  153. control_plane_api/worker/services/cancellation_manager.py +177 -0
  154. control_plane_api/worker/services/data_visualization.py +827 -0
  155. control_plane_api/worker/services/jira_tools.py +257 -0
  156. control_plane_api/worker/services/runtime_analytics.py +328 -0
  157. control_plane_api/worker/services/session_service.py +194 -0
  158. control_plane_api/worker/services/skill_factory.py +175 -0
  159. control_plane_api/worker/services/team_executor.py +574 -0
  160. control_plane_api/worker/services/team_executor_v2.py +465 -0
  161. control_plane_api/worker/services/workflow_executor_tools.py +1418 -0
  162. control_plane_api/worker/tests/__init__.py +1 -0
  163. control_plane_api/worker/tests/e2e/__init__.py +0 -0
  164. control_plane_api/worker/tests/e2e/test_execution_flow.py +571 -0
  165. control_plane_api/worker/tests/integration/__init__.py +0 -0
  166. control_plane_api/worker/tests/integration/test_control_plane_integration.py +308 -0
  167. control_plane_api/worker/tests/unit/__init__.py +0 -0
  168. control_plane_api/worker/tests/unit/test_control_plane_client.py +401 -0
  169. control_plane_api/worker/utils/__init__.py +1 -0
  170. control_plane_api/worker/utils/chunk_batcher.py +305 -0
  171. control_plane_api/worker/utils/retry_utils.py +60 -0
  172. control_plane_api/worker/utils/streaming_utils.py +373 -0
  173. control_plane_api/worker/worker.py +753 -0
  174. control_plane_api/worker/workflows/__init__.py +0 -0
  175. control_plane_api/worker/workflows/agent_execution.py +589 -0
  176. control_plane_api/worker/workflows/team_execution.py +429 -0
  177. kubiya_control_plane_api-0.3.4.dist-info/METADATA +229 -0
  178. kubiya_control_plane_api-0.3.4.dist-info/RECORD +182 -0
  179. kubiya_control_plane_api-0.3.4.dist-info/entry_points.txt +2 -0
  180. kubiya_control_plane_api-0.3.4.dist-info/top_level.txt +1 -0
  181. kubiya_control_plane_api-0.1.0.dist-info/METADATA +0 -66
  182. kubiya_control_plane_api-0.1.0.dist-info/RECORD +0 -5
  183. kubiya_control_plane_api-0.1.0.dist-info/top_level.txt +0 -1
  184. {kubiya_control_plane_api-0.1.0.dist-info/licenses → control_plane_api}/LICENSE +0 -0
  185. {kubiya_control_plane_api-0.1.0.dist-info → kubiya_control_plane_api-0.3.4.dist-info}/WHEEL +0 -0
control_plane_api/app/routers/executions.py
@@ -0,0 +1,1911 @@
+"""
+Multi-tenant executions router with Supabase.
+
+This router handles execution queries for the authenticated organization.
+Uses Supabase directly to avoid SQLAlchemy enum validation issues.
+"""
+
+from fastapi import APIRouter, Depends, HTTPException, status, Request
+from fastapi.responses import StreamingResponse
+from typing import List, Optional
+from datetime import datetime
+from pydantic import BaseModel, Field
+import structlog
+import asyncio
+import json
+
+from control_plane_api.app.middleware.auth import get_current_organization
+from control_plane_api.app.lib.supabase import get_supabase
+from control_plane_api.app.lib.temporal_client import get_temporal_client
+from control_plane_api.app.lib.redis_client import get_redis_client
+from control_plane_api.app.workflows.agent_execution import AgentExecutionWorkflow
+from control_plane_api.app.services.agno_service import agno_service
+
+logger = structlog.get_logger()
+
+router = APIRouter()
+
+
+# Pydantic schemas
+class ParticipantResponse(BaseModel):
+    """Participant in an execution"""
+    id: str
+    user_id: str
+    user_name: str | None
+    user_email: str | None
+    user_avatar: str | None
+    role: str
+    joined_at: str
+    last_active_at: str
+
+
+class ExecutionResponse(BaseModel):
+    id: str
+    organization_id: str
+    execution_type: str
+    entity_id: str
+    entity_name: str | None
+    prompt: str
+    system_prompt: str | None
+    status: str
+    response: str | None
+    error_message: str | None
+    usage: dict
+    execution_metadata: dict
+    runner_name: str | None
+    user_id: str | None
+    user_name: str | None
+    user_email: str | None
+    user_avatar: str | None
+    created_at: str
+    started_at: str | None
+    completed_at: str | None
+    updated_at: str
+    participants: List[ParticipantResponse] = Field(default_factory=list)
+
+
+@router.get("", response_model=List[ExecutionResponse])
+async def list_executions(
+    request: Request,
+    skip: int = 0,
+    limit: int = 100,
+    status_filter: str | None = None,
+    execution_type: str | None = None,
+    organization: dict = Depends(get_current_organization),
+):
+    """List all executions for the organization with optional filtering"""
+    try:
+        client = get_supabase()
+
+        # Query executions for this organization with participants
+        query = client.table("executions").select("*, execution_participants(*)").eq("organization_id", organization["id"])
+
+        if status_filter:
+            query = query.eq("status", status_filter.lower())  # Normalize to lowercase
+        if execution_type:
+            query = query.eq("execution_type", execution_type.upper())
+
+        query = query.order("created_at", desc=True).range(skip, skip + limit - 1)
+
+        result = query.execute()
+
+        if not result or not result.data:
+            logger.info("no_executions_found", org_id=organization["id"])
+            return []
+
+        executions = []
+        for execution in result.data:
+            try:
+                # Parse participants
+                participants_data = execution.get("execution_participants", [])
+                participants = []
+                for p in participants_data:
+                    try:
+                        participants.append(ParticipantResponse(
+                            id=p["id"],
+                            user_id=p["user_id"],
+                            user_name=p.get("user_name"),
+                            user_email=p.get("user_email"),
+                            user_avatar=p.get("user_avatar"),
+                            role=p["role"],
+                            joined_at=p["joined_at"],
+                            last_active_at=p["last_active_at"],
+                        ))
+                    except Exception as participant_error:
+                        logger.warning("failed_to_parse_participant", error=str(participant_error), execution_id=execution.get("id"))
+                        # Skip invalid participant, continue with others
+
+                executions.append(
+                    ExecutionResponse(
+                        id=execution["id"],
+                        organization_id=execution["organization_id"],
+                        execution_type=execution["execution_type"],
+                        entity_id=execution["entity_id"],
+                        entity_name=execution.get("entity_name"),
+                        prompt=execution.get("prompt", ""),
+                        system_prompt=execution.get("system_prompt"),
+                        status=execution["status"],
+                        response=execution.get("response"),
+                        error_message=execution.get("error_message"),
+                        usage=execution.get("usage", {}),
+                        execution_metadata=execution.get("execution_metadata", {}),
+                        runner_name=execution.get("runner_name"),
+                        user_id=execution.get("user_id"),
+                        user_name=execution.get("user_name"),
+                        user_email=execution.get("user_email"),
+                        user_avatar=execution.get("user_avatar"),
+                        created_at=execution["created_at"],
+                        started_at=execution.get("started_at"),
+                        completed_at=execution.get("completed_at"),
+                        updated_at=execution["updated_at"],
+                        participants=participants,
+                    )
+                )
+            except Exception as execution_error:
+                logger.error("failed_to_parse_execution", error=str(execution_error), execution_id=execution.get("id"))
+                # Skip invalid execution, continue with others
+
+        logger.info(
+            "executions_listed_successfully",
+            count=len(executions),
+            org_id=organization["id"],
+        )
+
+        return executions
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(
+            "executions_list_failed",
+            error=str(e),
+            error_type=type(e).__name__,
+            org_id=organization["id"]
+        )
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Failed to list executions: {str(e)}"
+        )
+
+
+@router.get("/{execution_id}", response_model=ExecutionResponse)
+async def get_execution(
+    execution_id: str,
+    request: Request,
+    organization: dict = Depends(get_current_organization),
+):
+    """Get a specific execution by ID"""
+    try:
+        client = get_supabase()
+
+        result = (
+            client.table("executions")
+            .select("*")
+            .eq("id", execution_id)
+            .eq("organization_id", organization["id"])
+            .single()
+            .execute()
+        )
+
+        if not result.data:
+            raise HTTPException(status_code=404, detail="Execution not found")
+
+        execution_data = result.data
+
+        return ExecutionResponse(
+            id=execution_data["id"],
+            organization_id=execution_data["organization_id"],
+            execution_type=execution_data["execution_type"],
+            entity_id=execution_data["entity_id"],
+            entity_name=execution_data.get("entity_name"),
+            prompt=execution_data.get("prompt", ""),
+            system_prompt=execution_data.get("system_prompt"),
+            status=execution_data["status"],
+            response=execution_data.get("response"),
+            error_message=execution_data.get("error_message"),
+            usage=execution_data.get("usage", {}),
+            execution_metadata=execution_data.get("execution_metadata", {}),
+            runner_name=execution_data.get("runner_name"),
+            user_id=execution_data.get("user_id"),
+            user_name=execution_data.get("user_name"),
+            user_email=execution_data.get("user_email"),
+            user_avatar=execution_data.get("user_avatar"),
+            created_at=execution_data["created_at"],
+            started_at=execution_data.get("started_at"),
+            completed_at=execution_data.get("completed_at"),
+            updated_at=execution_data["updated_at"],
+        )
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error("execution_get_failed", error=str(e), execution_id=execution_id)
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Failed to get execution: {str(e)}"
+        )
+
+
+@router.delete("/{execution_id}", status_code=status.HTTP_204_NO_CONTENT)
+async def delete_execution(
+    execution_id: str,
+    request: Request,
+    organization: dict = Depends(get_current_organization),
+):
+    """Delete an execution"""
+    try:
+        client = get_supabase()
+
+        result = (
+            client.table("executions")
+            .delete()
+            .eq("id", execution_id)
+            .eq("organization_id", organization["id"])
+            .execute()
+        )
+
+        if not result.data:
+            raise HTTPException(status_code=404, detail="Execution not found")
+
+        logger.info("execution_deleted", execution_id=execution_id, org_id=organization["id"])
+
+        return None
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error("execution_delete_failed", error=str(e), execution_id=execution_id)
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Failed to delete execution: {str(e)}"
+        )
+
+
+class ExecutionUpdate(BaseModel):
+    """Update execution fields - used by workers to update execution status"""
+    status: str | None = None
+    started_at: str | None = None
+    completed_at: str | None = None
+    response: str | None = None
+    error_message: str | None = None
+    usage: dict | None = None
+    execution_metadata: dict | None = None
+
+
+@router.patch("/{execution_id}", response_model=ExecutionResponse)
+async def update_execution(
+    execution_id: str,
+    execution_update: ExecutionUpdate,
+    request: Request,
+    organization: dict = Depends(get_current_organization),
+):
+    """
+    Update execution status and results.
+
+    This endpoint is primarily used by workers to update execution status,
+    results, usage metrics, and metadata during execution.
+    """
+    try:
+        client = get_supabase()
+
+        # Build update dict - only include provided fields
+        update_data = {}
+
+        if execution_update.status is not None:
+            update_data["status"] = execution_update.status.lower()  # Normalize to lowercase
+
+        if execution_update.started_at is not None:
+            update_data["started_at"] = execution_update.started_at
+
+        if execution_update.completed_at is not None:
+            update_data["completed_at"] = execution_update.completed_at
+
+        if execution_update.response is not None:
+            update_data["response"] = execution_update.response
+
+        if execution_update.error_message is not None:
+            update_data["error_message"] = execution_update.error_message
+
+        if execution_update.usage is not None:
+            update_data["usage"] = execution_update.usage
+
+        if execution_update.execution_metadata is not None:
+            update_data["execution_metadata"] = execution_update.execution_metadata
+
+        # Always update updated_at
+        update_data["updated_at"] = datetime.utcnow().isoformat()
+
+        # Update execution
+        result = (
+            client.table("executions")
+            .update(update_data)
+            .eq("id", execution_id)
+            .eq("organization_id", organization["id"])
+            .execute()
+        )
+
+        if not result.data:
+            raise HTTPException(status_code=404, detail="Execution not found")
+
+        execution_data = result.data[0]
+
+        logger.info(
+            "execution_updated",
+            execution_id=execution_id,
+            org_id=organization["id"],
+            fields_updated=list(update_data.keys()),
+        )
+
+        return ExecutionResponse(
+            id=execution_data["id"],
+            organization_id=execution_data["organization_id"],
+            execution_type=execution_data["execution_type"],
+            entity_id=execution_data["entity_id"],
+            entity_name=execution_data.get("entity_name"),
+            prompt=execution_data.get("prompt", ""),
+            system_prompt=execution_data.get("system_prompt"),
+            status=execution_data["status"],
+            response=execution_data.get("response"),
+            error_message=execution_data.get("error_message"),
+            usage=execution_data.get("usage", {}),
+            execution_metadata=execution_data.get("execution_metadata", {}),
+            runner_name=execution_data.get("runner_name"),
+            user_id=execution_data.get("user_id"),
+            user_name=execution_data.get("user_name"),
+            user_email=execution_data.get("user_email"),
+            user_avatar=execution_data.get("user_avatar"),
+            created_at=execution_data["created_at"],
+            started_at=execution_data.get("started_at"),
+            completed_at=execution_data.get("completed_at"),
+            updated_at=execution_data["updated_at"],
+        )
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error("execution_update_failed", error=str(e), execution_id=execution_id)
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Failed to update execution: {str(e)}"
+        )
+
+
+class SendMessageRequest(BaseModel):
+    """Request to send a message to a running execution"""
+    message: str
+    role: str = "user"  # user, system, etc.
+
+
+@router.post("/{execution_id}/message", status_code=status.HTTP_202_ACCEPTED)
+async def send_message_to_execution(
+    execution_id: str,
+    request_body: SendMessageRequest,
+    request: Request,
+    organization: dict = Depends(get_current_organization),
+):
+    """
+    Send a followup message to a running execution using Temporal signals.
+
+    This sends a signal to the Temporal workflow, adding the message to the conversation.
+    The workflow will process the message and respond accordingly.
+    """
+    try:
+        # Get Temporal client
+        temporal_client = await get_temporal_client()
+
+        # Verify the execution belongs to this organization and get execution type
+        client = get_supabase()
+        result = (
+            client.table("executions")
+            .select("id, organization_id, status, execution_type")
+            .eq("id", execution_id)
+            .eq("organization_id", organization["id"])
+            .single()
+            .execute()
+        )
+
+        if not result.data:
+            raise HTTPException(status_code=404, detail="Execution not found")
+
+        # Construct workflow ID based on execution type
+        execution_type = result.data.get("execution_type", "AGENT")
+        if execution_type == "TEAM":
+            workflow_id = f"team-execution-{execution_id}"
+        else:
+            workflow_id = f"agent-execution-{execution_id}"
+
+        workflow_handle = temporal_client.get_workflow_handle(workflow_id)
+
+        # Import ChatMessage from workflow
+        from control_plane_api.app.workflows.agent_execution import ChatMessage
+        from datetime import datetime, timezone
+
+        # Create the message with user attribution from JWT token
+        message = ChatMessage(
+            role=request_body.role,
+            content=request_body.message,
+            timestamp=datetime.now(timezone.utc).isoformat(),
+            user_id=organization.get("user_id"),
+            user_name=organization.get("user_name"),
+            user_email=organization.get("user_email"),
+            user_avatar=organization.get("user_avatar"),  # Now available from JWT via auth middleware
+        )
+
+        # Send signal to workflow
+        await workflow_handle.signal(AgentExecutionWorkflow.add_message, message)
+
+        # Add user as participant if not already added (multiplayer support)
+        user_id = organization.get("user_id")
+        if user_id:
+            try:
+                # Check if participant already exists
+                existing = (
+                    client.table("execution_participants")
+                    .select("id")
+                    .eq("execution_id", execution_id)
+                    .eq("user_id", user_id)
+                    .execute()
+                )
+
+                if not existing.data or len(existing.data) == 0:
+                    # Add as new participant (collaborator role)
+                    import uuid
+                    client.table("execution_participants").insert({
+                        "id": str(uuid.uuid4()),
+                        "execution_id": execution_id,
+                        "organization_id": organization["id"],
+                        "user_id": user_id,
+                        "user_name": organization.get("user_name"),
+                        "user_email": organization.get("user_email"),
+                        "user_avatar": organization.get("user_avatar"),
+                        "role": "collaborator",
+                    }).execute()
+                    logger.info(
+                        "participant_added",
+                        execution_id=execution_id,
+                        user_id=user_id,
+                    )
+                else:
+                    # Update last_active_at for existing participant
+                    client.table("execution_participants").update({
+                        "last_active_at": datetime.now(timezone.utc).isoformat(),
+                    }).eq("execution_id", execution_id).eq("user_id", user_id).execute()
+            except Exception as participant_error:
+                logger.warning(
+                    "failed_to_add_participant",
+                    error=str(participant_error),
+                    execution_id=execution_id,
+                )
+                # Don't fail the whole request if participant tracking fails
+
+        logger.info(
+            "message_sent_to_execution",
+            execution_id=execution_id,
+            org_id=organization["id"],
+            role=request_body.role,
+        )
+
+        return {
+            "success": True,
+            "execution_id": execution_id,
+            "message": "Message sent to workflow",
+        }
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(
+            "send_message_failed",
+            error=str(e),
+            execution_id=execution_id,
+        )
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Failed to send message: {str(e)}"
+        )
+
+
+@router.post("/{execution_id}/pause")
+async def pause_execution(
+    execution_id: str,
+    request: Request,
+    organization: dict = Depends(get_current_organization),
+):
+    """
+    Pause an active execution by sending a signal to the Temporal workflow.
+
+    This is triggered when the user clicks the PAUSE button in the UI.
+    The workflow will stop processing but remain active and can be resumed.
+    """
+    try:
+        logger.info(
+            "pause_execution_requested",
+            execution_id=execution_id,
+            org_id=organization["id"]
+        )
+
+        # Get execution from Supabase
+        client = get_supabase()
+        result = (
+            client.table("executions")
+            .select("id, status, execution_type")
+            .eq("id", execution_id)
+            .eq("organization_id", organization["id"])
+            .single()
+            .execute()
+        )
+
+        if not result.data:
+            raise HTTPException(
+                status_code=status.HTTP_404_NOT_FOUND,
+                detail="Execution not found"
+            )
+
+        execution = result.data
+        current_status = execution["status"]
+
+        # Check if execution can be paused
+        if current_status not in ["running", "waiting_for_input"]:
+            logger.warning(
+                "pause_execution_invalid_status",
+                execution_id=execution_id,
+                status=current_status
+            )
+            return {
+                "success": False,
+                "error": f"Execution cannot be paused (status: {current_status})",
+                "execution_id": execution_id,
+                "status": current_status,
+            }
+
+        # Get Temporal client
+        temporal_client = await get_temporal_client()
+
+        # Determine workflow ID based on execution type
+        execution_type = execution.get("execution_type", "AGENT")
+        workflow_id = f"team-execution-{execution_id}" if execution_type == "TEAM" else f"agent-execution-{execution_id}"
+
+        workflow_handle = temporal_client.get_workflow_handle(workflow_id)
+
+        # Send pause signal to workflow
+        await workflow_handle.signal(AgentExecutionWorkflow.pause_execution)
+
+        # Update execution status to paused in Supabase
+        (
+            client.table("executions")
+            .update({
+                "status": "paused",
+                "updated_at": datetime.utcnow().isoformat(),
+            })
+            .eq("id", execution_id)
+            .eq("organization_id", organization["id"])
+            .execute()
+        )
+
+        # Emit system message to Redis for UI display
+        redis_client = get_redis_client()
+        if redis_client:
+            try:
+                import time
+                user_name = organization.get("user_name", "User")
+                current_timestamp = datetime.utcnow().isoformat()
+                message_id = f"{execution_id}_pause_{int(time.time() * 1000000)}"
+
+                # Create message event - format matches what streaming endpoint expects
+                pause_message_event = {
+                    "event_type": "message",
+                    "data": {
+                        "role": "system",
+                        "content": f"⏸️ Execution paused by {user_name}",
+                        "timestamp": current_timestamp,
+                        "message_id": message_id,
+                    },
+                    "timestamp": current_timestamp,
+                    "execution_id": execution_id,
+                }
+
+                redis_key = f"execution:{execution_id}:events"
+                await redis_client.lpush(redis_key, json.dumps(pause_message_event))
+                await redis_client.ltrim(redis_key, 0, 999)
+                await redis_client.expire(redis_key, 3600)
+
+                # Also update status event
+                status_event = {
+                    "event_type": "status",
+                    "data": {"status": "paused", "execution_id": execution_id},
+                    "timestamp": current_timestamp,
+                    "execution_id": execution_id,
+                }
+                await redis_client.lpush(redis_key, json.dumps(status_event))
+
+                logger.debug("pause_event_published_to_redis", execution_id=execution_id)
+            except Exception as redis_error:
+                logger.warning("failed_to_publish_pause_event", error=str(redis_error), execution_id=execution_id)
+
+        logger.info(
+            "execution_paused_successfully",
+            execution_id=execution_id,
+            workflow_id=workflow_id
+        )
+
+        return {
+            "success": True,
+            "execution_id": execution_id,
+            "workflow_id": workflow_id,
+            "message": "Execution paused successfully",
+        }
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(
+            "pause_execution_error",
+            execution_id=execution_id,
+            error=str(e)
+        )
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Failed to pause execution: {str(e)}"
+        )
+
+
+@router.post("/{execution_id}/resume")
+async def resume_execution(
+    execution_id: str,
+    request: Request,
+    organization: dict = Depends(get_current_organization),
+):
+    """
+    Resume a paused execution by sending a signal to the Temporal workflow.
+
+    This is triggered when the user clicks the RESUME button in the UI.
+    The workflow will continue processing from where it was paused.
+    """
+    try:
+        logger.info(
+            "resume_execution_requested",
+            execution_id=execution_id,
+            org_id=organization["id"]
+        )
+
+        # Get execution from Supabase
+        client = get_supabase()
+        result = (
+            client.table("executions")
+            .select("id, status, execution_type")
+            .eq("id", execution_id)
+            .eq("organization_id", organization["id"])
+            .single()
+            .execute()
+        )
+
+        if not result.data:
+            raise HTTPException(
+                status_code=status.HTTP_404_NOT_FOUND,
+                detail="Execution not found"
+            )
+
+        execution = result.data
+        current_status = execution["status"]
+
+        # Check if execution is paused
+        if current_status != "paused":
+            logger.warning(
+                "resume_execution_not_paused",
+                execution_id=execution_id,
+                status=current_status
+            )
+            return {
+                "success": False,
+                "error": f"Execution is not paused (status: {current_status})",
+                "execution_id": execution_id,
+                "status": current_status,
+            }
+
+        # Get Temporal client
+        temporal_client = await get_temporal_client()
+
+        # Determine workflow ID based on execution type
+        execution_type = execution.get("execution_type", "AGENT")
+        workflow_id = f"team-execution-{execution_id}" if execution_type == "TEAM" else f"agent-execution-{execution_id}"
+
+        workflow_handle = temporal_client.get_workflow_handle(workflow_id)
+
+        # Send resume signal to workflow
+        await workflow_handle.signal(AgentExecutionWorkflow.resume_execution)
+
+        # Update execution status back to running/waiting in Supabase
+        # The workflow will determine the correct status
+        (
+            client.table("executions")
+            .update({
+                "status": "running",  # Workflow will update to correct status
+                "updated_at": datetime.utcnow().isoformat(),
+            })
+            .eq("id", execution_id)
+            .eq("organization_id", organization["id"])
+            .execute()
+        )
+
+        # Emit system message to Redis for UI display
+        redis_client = get_redis_client()
+        if redis_client:
+            try:
+                import time
+                user_name = organization.get("user_name", "User")
+                current_timestamp = datetime.utcnow().isoformat()
+                message_id = f"{execution_id}_resume_{int(time.time() * 1000000)}"
+
+                # Create message event - format matches what streaming endpoint expects
+                resume_message_event = {
+                    "event_type": "message",
+                    "data": {
+                        "role": "system",
+                        "content": f"▶️ Execution resumed by {user_name}",
+                        "timestamp": current_timestamp,
+                        "message_id": message_id,
+                    },
+                    "timestamp": current_timestamp,
+                    "execution_id": execution_id,
+                }
+
+                redis_key = f"execution:{execution_id}:events"
+                await redis_client.lpush(redis_key, json.dumps(resume_message_event))
+                await redis_client.ltrim(redis_key, 0, 999)
+                await redis_client.expire(redis_key, 3600)
+
+                # Also update status event
+                status_event = {
+                    "event_type": "status",
+                    "data": {"status": "running", "execution_id": execution_id},
+                    "timestamp": current_timestamp,
+                    "execution_id": execution_id,
+                }
+                await redis_client.lpush(redis_key, json.dumps(status_event))
+
+                logger.debug("resume_event_published_to_redis", execution_id=execution_id)
+            except Exception as redis_error:
+                logger.warning("failed_to_publish_resume_event", error=str(redis_error), execution_id=execution_id)
+
+        logger.info(
+            "execution_resumed_successfully",
+            execution_id=execution_id,
+            workflow_id=workflow_id
+        )
+
+        return {
+            "success": True,
+            "execution_id": execution_id,
+            "workflow_id": workflow_id,
+            "message": "Execution resumed successfully",
+        }
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(
+            "resume_execution_error",
+            execution_id=execution_id,
+            error=str(e)
+        )
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Failed to resume execution: {str(e)}"
+        )
+
+
+@router.post("/{execution_id}/cancel")
+async def cancel_execution(
+    execution_id: str,
+    request: Request,
+    organization: dict = Depends(get_current_organization),
+):
+    """
+    Cancel an active execution by calling Temporal's workflow cancellation.
+
+    This is triggered when the user clicks the STOP button in the UI.
+    It uses Temporal's built-in cancellation which is fast and returns immediately.
+    """
+    try:
+        from temporalio.client import WorkflowHandle
+
+        logger.info(
+            "cancel_execution_requested",
+            execution_id=execution_id,
+            org_id=organization["id"]
+        )
+
+        # Get execution from Supabase
+        client = get_supabase()
+        result = (
+            client.table("executions")
+            .select("id, status, execution_type")
+            .eq("id", execution_id)
+            .eq("organization_id", organization["id"])
+            .single()
+            .execute()
+        )
+
+        if not result.data:
+            raise HTTPException(
+                status_code=status.HTTP_404_NOT_FOUND,
+                detail="Execution not found"
+            )
+
+        execution = result.data
+        current_status = execution["status"]
+
+        # Check if execution is still running
+        if current_status not in ["running", "waiting_for_input"]:
+            logger.warning(
+                "cancel_execution_not_running",
+                execution_id=execution_id,
+                status=current_status
+            )
+            return {
+                "success": False,
+                "error": f"Execution is not running (status: {current_status})",
+                "execution_id": execution_id,
+                "status": current_status,
+            }
+
+        # Get Temporal client
+        temporal_client = await get_temporal_client()
+
+        # Determine workflow ID based on execution type
+        execution_type = execution.get("execution_type", "AGENT")
+        workflow_id = f"team-execution-{execution_id}" if execution_type == "TEAM" else f"agent-execution-{execution_id}"
+
+        workflow_handle: WorkflowHandle = temporal_client.get_workflow_handle(
+            workflow_id=workflow_id
+        )
+
+        # Use Temporal's built-in workflow cancellation
+        # This is fast and returns immediately
+        try:
+            # Send cancel signal to the workflow
+            # This returns immediately - it doesn't wait for the workflow to finish
+            await workflow_handle.cancel()
+
+            # Update execution status to cancelled in Supabase
+            update_result = (
+                client.table("executions")
+                .update({
+                    "status": "cancelled",
+                    "completed_at": datetime.utcnow().isoformat(),
+                    "error_message": "Cancelled by user",
+                    "updated_at": datetime.utcnow().isoformat(),
+                })
+                .eq("id", execution_id)
+                .eq("organization_id", organization["id"])
+                .execute()
+            )
+
+            # Emit system message to Redis for UI display
+            redis_client = get_redis_client()
+            if redis_client:
+                try:
+                    import time
+                    user_name = organization.get("user_name", "User")
+                    current_timestamp = datetime.utcnow().isoformat()
+                    message_id = f"{execution_id}_cancel_{int(time.time() * 1000000)}"
+
+                    # Create message event - format matches what streaming endpoint expects
+                    cancel_message_event = {
+                        "event_type": "message",
+                        "data": {
+                            "role": "system",
+                            "content": f"🛑 Execution stopped by {user_name}",
+                            "timestamp": current_timestamp,
+                            "message_id": message_id,
+                        },
+                        "timestamp": current_timestamp,
+                        "execution_id": execution_id,
+                    }
+
+                    redis_key = f"execution:{execution_id}:events"
+                    await redis_client.lpush(redis_key, json.dumps(cancel_message_event))
+                    await redis_client.ltrim(redis_key, 0, 999)
+                    await redis_client.expire(redis_key, 3600)
+
+                    # Also update status event
+                    status_event = {
+                        "event_type": "status",
+                        "data": {"status": "cancelled", "execution_id": execution_id},
+                        "timestamp": current_timestamp,
+                        "execution_id": execution_id,
+                    }
+                    await redis_client.lpush(redis_key, json.dumps(status_event))
+
+                    logger.debug("cancel_event_published_to_redis", execution_id=execution_id)
+                except Exception as redis_error:
+                    logger.warning("failed_to_publish_cancel_event", error=str(redis_error), execution_id=execution_id)
+
+            logger.info(
+                "execution_cancelled_successfully",
+                execution_id=execution_id,
+                workflow_id=workflow_id
+            )
+
+            return {
+                "success": True,
+                "execution_id": execution_id,
+                "workflow_id": workflow_id,
+                "message": "Execution cancelled successfully",
+            }
+
+        except Exception as cancel_error:
+            logger.error(
+                "cancel_workflow_error",
+                execution_id=execution_id,
+                error=str(cancel_error)
+            )
+
+            # Mark as cancelled in database anyway (user intent matters)
+            (
+                client.table("executions")
+                .update({
+                    "status": "cancelled",
+                    "completed_at": datetime.utcnow().isoformat(),
+                    "error_message": f"Cancelled: {str(cancel_error)}",
+                    "updated_at": datetime.utcnow().isoformat(),
+                })
+                .eq("id", execution_id)
+                .eq("organization_id", organization["id"])
+                .execute()
+            )
+
+            return {
+                "success": True,  # User intent succeeded
+                "execution_id": execution_id,
+                "message": "Execution marked as cancelled",
+                "warning": str(cancel_error),
+            }
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(
+            "cancel_execution_error",
+            execution_id=execution_id,
+            error=str(e)
+        )
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Failed to cancel execution: {str(e)}"
+        )
+
+
+@router.get("/{execution_id}/session")
+async def get_session_history(
+    execution_id: str,
+    request: Request,
+    organization: dict = Depends(get_current_organization),
+):
+    """
+    Retrieve session history with Redis caching for hot loading.
+
+    Workers GET session messages before each run to restore conversation context.
+
+    Performance strategy:
+    1. Check Redis cache first (hot path - milliseconds)
+    2. Fall back to Supabase if not cached (cold path - ~50ms)
+    3. Cache the result in Redis for next access
+    """
+    import json
+    try:
+        session_id = execution_id
+        redis_key = f"session:{session_id}"
+
+        # Try Redis first for hot loading
+        redis_client = get_redis_client()
+        if redis_client:
+            try:
+                cached_session = await redis_client.get(redis_key)
+                if cached_session:
+                    session_data = json.loads(cached_session)
+                    logger.info(
+                        "session_cache_hit",
+                        execution_id=execution_id,
+                        message_count=session_data.get("message_count", 0)
+                    )
+                    return session_data
+            except Exception as redis_error:
+                logger.warning("session_cache_error", error=str(redis_error))
+                # Continue to DB fallback
+
+        # Redis miss or unavailable - load from Supabase
+        client = get_supabase()
+
+        result = (
+            client.table("sessions")
+            .select("*")
+            .eq("execution_id", execution_id)
+            .eq("organization_id", organization["id"])
+            .single()
+            .execute()
+        )
+
+        if not result.data:
+            raise HTTPException(status_code=404, detail="Session not found")
+
+        session_record = result.data
+        messages = session_record.get("messages", [])
+
+        session_data = {
+            "session_id": session_record.get("session_id", execution_id),
+            "execution_id": execution_id,
+            "messages": messages,
+            "message_count": len(messages),
+            "metadata": session_record.get("metadata", {}),
+        }
+
+        # Cache in Redis for next access (TTL: 1 hour)
+        if redis_client:
+            try:
+                await redis_client.setex(
+                    redis_key,
+                    3600,  # 1 hour TTL
+                    json.dumps(session_data)
+                )
+                logger.info(
+                    "session_cached",
+                    execution_id=execution_id,
+                    message_count=len(messages)
+                )
+            except Exception as cache_error:
+                logger.warning("session_cache_write_error", error=str(cache_error))
+
+        logger.info(
+            "session_history_retrieved_from_supabase",
+            execution_id=execution_id,
+            session_id=session_record.get("session_id"),
+            message_count=len(messages)
+        )
+
+        return session_data
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(
+            "failed_to_retrieve_session_history",
+            execution_id=execution_id,
+            error=str(e)
+        )
+        raise HTTPException(
+            status_code=500,
+            detail=f"Failed to retrieve session history: {str(e)}"
+        )
+
+
+@router.post("/{execution_id}/session", status_code=status.HTTP_201_CREATED)
+async def persist_session_history(
+    execution_id: str,
+    session_data: dict,
+    request: Request,
+    organization: dict = Depends(get_current_organization),
+):
+    """
+    Persist session history from worker to Control Plane database.
+
+    Worker POSTs session messages after each run completion.
+    This ensures history is available even when worker is offline.
+
+    Sessions are stored in Supabase for fast loading by the UI streaming endpoint.
+    """
+    try:
+        client = get_supabase()
+
+        session_id = session_data.get("session_id", execution_id)
+        user_id = session_data.get("user_id")
+        messages = session_data.get("messages", [])
+        metadata = session_data.get("metadata", {})
+
+        logger.info(
+            "persisting_session_history",
+            execution_id=execution_id,
+            session_id=session_id,
+            user_id=user_id,
+            message_count=len(messages),
+            org_id=organization["id"],
+        )
+
+        # Upsert to Supabase sessions table
+        # This matches what the streaming endpoint expects to load
+        session_record = {
+            "execution_id": execution_id,
+            "session_id": session_id,
+            "organization_id": organization["id"],
+            "user_id": user_id,
+            "messages": messages,
+            "metadata": metadata,
+            "updated_at": datetime.utcnow().isoformat(),
+        }
+
+        result = (
+            client.table("sessions")
+            .upsert(session_record, on_conflict="execution_id")
+            .execute()
+        )
+
+        if not result.data:
+            logger.error(
+                "session_upsert_failed",
+                execution_id=execution_id,
+                session_id=session_id
+            )
+            return {
+                "success": False,
+                "error": "Failed to upsert session to database"
+            }
+
+        logger.info(
+            "session_persisted_to_supabase",
+            execution_id=execution_id,
+            session_id=session_id,
+            message_count=len(messages),
+        )
+
+        # Cache in Redis for hot loading on next access
+        import json
+
+        redis_client = get_redis_client()
+        if redis_client:
+            try:
+                redis_key = f"session:{session_id}"
+                cache_data = {
+                    "session_id": session_id,
+                    "execution_id": execution_id,
+                    "messages": messages,
+                    "message_count": len(messages),
+                }
+                await redis_client.setex(
+                    redis_key,
+                    3600,  # 1 hour TTL
+                    json.dumps(cache_data)
+                )
+                logger.info(
+                    "session_cached_on_write",
+                    execution_id=execution_id,
+                    message_count=len(messages)
+                )
+            except Exception as cache_error:
+                logger.warning("session_cache_write_error_on_persist", error=str(cache_error))
+                # Don't fail persistence if caching fails
+
+        return {
+            "success": True,
+            "execution_id": execution_id,
+            "session_id": session_id,
+            "persisted_messages": len(messages),
+        }
+
+    except Exception as e:
+        logger.error(
+            "session_persistence_failed",
+            error=str(e),
+            execution_id=execution_id,
+        )
+        return {
+            "success": False,
+            "error": str(e),
+        }
+
+
+@router.post("/{execution_id}/mark-done", status_code=status.HTTP_202_ACCEPTED)
+async def mark_execution_as_done(
+    execution_id: str,
+    request: Request,
+    organization: dict = Depends(get_current_organization),
+):
+    """
+    Mark an execution as done, signaling the workflow to complete.
+
+    This sends a signal to the Temporal workflow to indicate the user is finished
+    with the conversation. The workflow will complete gracefully after this signal.
+    """
+    try:
+        # Get Temporal client
+        temporal_client = await get_temporal_client()
+
+        # Verify the execution belongs to this organization and get execution type
+        client = get_supabase()
+        result = (
+            client.table("executions")
+            .select("id, organization_id, status, execution_type")
+            .eq("id", execution_id)
+            .eq("organization_id", organization["id"])
+            .single()
+            .execute()
+        )
+
+        if not result.data:
+            raise HTTPException(status_code=404, detail="Execution not found")
+
+        # Construct workflow ID based on execution type
+        execution_type = result.data.get("execution_type", "AGENT")
+        if execution_type == "TEAM":
+            workflow_id = f"team-execution-{execution_id}"
+        else:
+            workflow_id = f"agent-execution-{execution_id}"
+
+        workflow_handle = temporal_client.get_workflow_handle(workflow_id)
+
+        # Send mark_as_done signal to workflow
+        await workflow_handle.signal(AgentExecutionWorkflow.mark_as_done)
+
+        logger.info(
+            "execution_marked_as_done",
+            execution_id=execution_id,
+            org_id=organization["id"],
+        )
+
+        return {
+            "success": True,
+            "execution_id": execution_id,
+            "message": "Execution marked as done, workflow will complete",
+        }
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(
+            "mark_as_done_failed",
+            error=str(e),
+            execution_id=execution_id,
+        )
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Failed to mark execution as done: {str(e)}"
+        )
+
+
+class StreamingEventRequest(BaseModel):
+    """Request to publish a streaming event to Redis for real-time UI updates"""
+    event_type: str  # "status", "message", "tool_started", "tool_completed", "error"
+    data: dict  # Event payload
+    timestamp: str | None = None
+
+
+@router.post("/{execution_id}/events", status_code=status.HTTP_202_ACCEPTED)
+async def publish_execution_event(
+    execution_id: str,
+    event: StreamingEventRequest,
+    request: Request,
+    organization: dict = Depends(get_current_organization),
+):
+    """
+    Publish a streaming event to Redis for real-time UI updates.
+
+    This endpoint is used by workers to send real-time events (tool execution, status updates, etc.)
+    that are streamed to the UI via SSE without waiting for Temporal workflow completion.
+
+    Events are stored in Redis list: execution:{execution_id}:events
+    TTL: 1 hour (events are temporary, final state persists in database)
+    """
+    try:
+        redis_client = get_redis_client()
+        if not redis_client:
+            # Redis not configured - skip streaming but don't fail
+            logger.warning("redis_not_configured_for_streaming", execution_id=execution_id)
+            return {"success": True, "message": "Redis not configured, event skipped"}
+
+        # Skip database verification for performance - authentication already validates organization
+        # Streaming events are temporary (1hr TTL) and don't need strict validation
+        # The worker is already authenticated via API key which validates organization
+
+        # Build event payload
+        event_data = {
+            "event_type": event.event_type,
+            "data": event.data,
+            "timestamp": event.timestamp or datetime.utcnow().isoformat(),
+            "execution_id": execution_id,
+        }
+
+        # Push event to Redis list (most recent at head) - this must be FAST
+        redis_key = f"execution:{execution_id}:events"
+        await redis_client.lpush(redis_key, json.dumps(event_data))
+
+        # Keep only last 1000 events (prevent memory issues)
+        await redis_client.ltrim(redis_key, 0, 999)
+
+        # Set TTL: 1 hour (events are temporary)
+        await redis_client.expire(redis_key, 3600)
+
+        # Also publish to pub/sub channel for real-time streaming
+        # This allows connected SSE clients to receive updates instantly
+        pubsub_channel = f"execution:{execution_id}:stream"
+        try:
+            await redis_client.publish(pubsub_channel, json.dumps(event_data))
+        except Exception as pubsub_error:
+            # Don't fail if pub/sub fails - the list storage is the primary mechanism
+            logger.debug("pubsub_publish_failed", error=str(pubsub_error), execution_id=execution_id[:8])
+
+        logger.info(
+            "execution_event_published",
+            execution_id=execution_id[:8],
+            event_type=event.event_type,
+        )
+
+        return {
+            "success": True,
+            "execution_id": execution_id,
+            "event_type": event.event_type,
+        }
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(
+            "publish_event_failed",
+            error=str(e),
+            execution_id=execution_id,
+            event_type=event.event_type,
+        )
+        # Don't fail the worker if streaming fails - it's not critical
+        return {
+            "success": False,
+            "error": str(e),
+            "message": "Event publishing failed but execution continues"
+        }
+
+
+ @router.get("/{execution_id}/stream")
+ async def stream_execution(
+     execution_id: str,
+     request: Request,
+     organization: dict = Depends(get_current_organization),
+ ):
+     """
+     Stream execution updates using Server-Sent Events (SSE).
+
+     This endpoint combines two sources for real-time updates:
+     1. Redis streaming events (from worker activities) - sub-second latency
+     2. Temporal workflow queries (for state consistency) - 200ms polling
+
+     The Redis events provide instant tool execution updates, while Temporal
+     ensures we never miss state changes even if Redis is unavailable.
+
+     SSE format:
+     - data: {json object with execution status, messages, tool calls}
+     - event: status|message|tool_started|tool_completed|error|done
+     """
+
+     async def generate_sse():
+         """Generate Server-Sent Events from Agno session and Temporal workflow state"""
+         import time
+         start_time = time.time()
+
+         try:
+             # Get Temporal client
+             t0 = time.time()
+             temporal_client = await get_temporal_client()
+             logger.info("timing_temporal_client", duration_ms=int((time.time() - t0) * 1000), execution_id=execution_id)
+
+             # Check Redis cache first for execution_type (fast, sub-millisecond)
+             execution_type = None
+             redis_client = get_redis_client()
+
+             if redis_client:
+                 try:
+                     t0 = time.time()
+                     # Check if we have metadata event in Redis
+                     redis_key = f"execution:{execution_id}:events"
+                     redis_events = await redis_client.lrange(redis_key, 0, -1)
+
+                     # Look for metadata event with execution_type
+                     if redis_events:
+                         for event_json in redis_events:
+                             try:
+                                 event_data = json.loads(event_json)
+                                 if event_data.get("event_type") == "metadata" and event_data.get("data", {}).get("execution_type"):
+                                     execution_type = event_data["data"]["execution_type"]
+                                     logger.info("timing_redis_cache_hit", duration_ms=int((time.time() - t0) * 1000), execution_id=execution_id, execution_type=execution_type)
+                                     break
+                             except json.JSONDecodeError:
+                                 continue
+                 except Exception as redis_error:
+                     logger.warning("redis_cache_lookup_failed", error=str(redis_error), execution_id=execution_id)
+
+             # Fall back to database if not in cache
+             if not execution_type:
+                 t0 = time.time()
+                 client = get_supabase()
+                 exec_result = (
+                     client.table("executions")
+                     .select("id, execution_type")
+                     .eq("id", execution_id)
+                     .eq("organization_id", organization["id"])
+                     .single()
+                     .execute()
+                 )
+                 logger.info("timing_db_query_fallback", duration_ms=int((time.time() - t0) * 1000), execution_id=execution_id)
+
+                 if not exec_result.data:
+                     raise HTTPException(status_code=404, detail="Execution not found")
+
+                 execution_type = exec_result.data.get("execution_type", "AGENT")
+
+             # Construct workflow ID based on execution type
+             # Team executions use "team-execution-{id}", agent executions use "agent-execution-{id}"
+             if execution_type == "TEAM":
+                 workflow_id = f"team-execution-{execution_id}"
+             else:
+                 workflow_id = f"agent-execution-{execution_id}"
+
+             workflow_handle = temporal_client.get_workflow_handle(workflow_id)
+
+             logger.info(
+                 "execution_stream_connecting",
+                 execution_id=execution_id,
+                 execution_type=execution_type,
+                 workflow_id=workflow_id,
+             )
+
+             last_status = None
+             last_message_count = 0
+             last_keepalive = asyncio.get_event_loop().time()
+             last_redis_event_index = -1  # Track which Redis events we've sent
+             consecutive_failures = 0  # Track consecutive workflow query failures
+             worker_down_mode = False  # Track if we're in worker-down fallback mode
+             last_db_poll = 0  # Track last database poll time
+
+             # Check if worker is ACTIVELY processing by checking Temporal workflow execution status
+             # This is much more performant than querying workflow state - it's just a metadata lookup
+             # We only stream from Redis if workflow is RUNNING at Temporal level (worker is active)
+             # Otherwise, we load from database (workflow completed/failed/no active worker)
+             is_workflow_running = False
+             try:
+                 t0 = time.time()
+                 description = await workflow_handle.describe()
+                 # Temporal execution status: RUNNING, COMPLETED, FAILED, CANCELLED, TERMINATED, TIMED_OUT, CONTINUED_AS_NEW
+                 # Use .name to get just the enum name (e.g., "RUNNING") without the class prefix
+                 temporal_status_name = description.status.name
+                 is_workflow_running = temporal_status_name == "RUNNING"
+                 logger.info(
+                     "initial_workflow_status",
+                     execution_id=execution_id,
+                     temporal_status=temporal_status_name,
+                     temporal_status_full=str(description.status),
+                     is_running=is_workflow_running,
+                     duration_ms=int((time.time() - t0) * 1000)
+                 )
+             except Exception as describe_error:
+                 # If we can't describe workflow, assume it's not running
+                 logger.warning("initial_workflow_describe_failed", execution_id=execution_id, error=str(describe_error))
+                 is_workflow_running = False
+
+             # ALWAYS load historical messages from database first
+             # This ensures UI sees conversation history even when connecting mid-execution
+             # In streaming mode, we'll then continue with Redis for real-time updates
+             t0 = time.time()
+             try:
+                 # Read session from Control Plane database (where worker persists)
+                 client = get_supabase()
+                 session_result = (
+                     client.table("sessions")
+                     .select("messages")
+                     .eq("execution_id", execution_id)
+                     .order("updated_at", desc=True)
+                     .limit(1)
+                     .execute()
+                 )
+
+                 session_messages = []
+                 if session_result.data and len(session_result.data) > 0:
+                     messages_data = session_result.data[0].get("messages", [])
+                     # Convert dict messages to objects with attributes
+                     from dataclasses import dataclass, field
+                     from typing import Optional as Opt
+
+                     @dataclass
+                     class SessionMessage:
+                         role: str
+                         content: str
+                         timestamp: Opt[str] = None
+                         user_id: Opt[str] = None
+                         user_name: Opt[str] = None
+                         user_email: Opt[str] = None
+                         user_avatar: Opt[str] = None
+
+                     session_messages = [SessionMessage(**msg) for msg in messages_data]
+
+                 if session_messages:
+                     logger.info(
+                         "sending_session_history_on_connect",
+                         execution_id=execution_id,
+                         message_count=len(session_messages)
+                     )
+
+                     # Send all existing messages immediately
+                     for msg in session_messages:
+                         msg_data = {
+                             "role": msg.role,
+                             "content": msg.content,
+                             "timestamp": msg.timestamp,  # Already in ISO format from database
+                         }
+                         # Include user attribution if available
+                         if msg.user_id:
+                             msg_data["user_id"] = msg.user_id
+                             msg_data["user_name"] = msg.user_name
+                             msg_data["user_email"] = msg.user_email
+                             msg_data["user_avatar"] = msg.user_avatar
+                         yield f"event: message\n"
+                         yield f"data: {json.dumps(msg_data)}\n\n"
+
+                     last_message_count = len(session_messages)
+
+                 logger.info("timing_session_history_load", duration_ms=int((time.time() - t0) * 1000), execution_id=execution_id, message_count=last_message_count)
+
+             except Exception as session_error:
+                 logger.warning(
+                     "failed_to_load_session_history",
+                     execution_id=execution_id,
+                     error=str(session_error),
+                     duration_ms=int((time.time() - t0) * 1000)
+                 )
+                 # Continue even if session loading fails - workflow state will still work
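For reference, the `messages` JSON that this block reconstructs into `SessionMessage` objects looks roughly like the sketch below. The field names mirror the dataclass above; the concrete values are invented for illustration and are not taken from the package.

    # Illustrative only - an example of a sessions row's `messages` payload.
    example_messages = [
        {
            "role": "user",
            "content": "Restart the staging deployment",
            "timestamp": "2024-01-01T12:00:00+00:00",
            "user_id": "usr_123",
            "user_name": "Jane Doe",
            "user_email": "jane@example.com",
            "user_avatar": "https://example.com/avatar.png",
        },
        # Optional user_* fields may be absent; the dataclass defaults them to None.
        {"role": "assistant", "content": "Deployment restarted.", "timestamp": "2024-01-01T12:00:05+00:00"},
    ]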
+
+             while True:
+                 # Check if client disconnected
+                 if await request.is_disconnected():
+                     logger.info("execution_stream_disconnected", execution_id=execution_id)
+                     break
+
+                 # Send keepalive comment every 15 seconds to prevent timeout
+                 current_time = asyncio.get_event_loop().time()
+                 if current_time - last_keepalive > 15:
+                     yield ": keepalive\n\n"
+                     last_keepalive = current_time
+
+                 # FIRST: Check Redis for NEW real-time streaming events (sub-second latency)
+                 # ONLY if workflow is actively running (worker is connected and processing)
+                 # We track which events we've sent to avoid re-sending
+                 if is_workflow_running and redis_client:
+                     try:
+                         redis_key = f"execution:{execution_id}:events"
+                         # Get the total count of events in Redis
+                         total_events = await redis_client.llen(redis_key)
+
+                         if total_events and total_events > (last_redis_event_index + 1):
+                             # There are new events we haven't sent yet
+                             logger.debug(
+                                 "redis_new_events_found",
+                                 execution_id=execution_id,
+                                 total=total_events,
+                                 last_index=last_redis_event_index
+                             )
+
+                             # Get all events (they're in reverse chronological order from LPUSH)
+                             all_redis_events = await redis_client.lrange(redis_key, 0, -1)
+
+                             if all_redis_events:
+                                 # Reverse to get chronological order (oldest first)
+                                 chronological_events = list(reversed(all_redis_events))
+
+                                 # Send only NEW events we haven't sent yet
+                                 for i in range(last_redis_event_index + 1, len(chronological_events)):
+                                     event_json = chronological_events[i]
+
+                                     try:
+                                         event_data = json.loads(event_json)
+                                         event_type = event_data.get("event_type", "message")
+
+                                         # For message events with wrapped data (pause/resume/cancel system messages),
+                                         # extract just the message data. For other events, send as-is.
+                                         if event_type == "message" and "data" in event_data and isinstance(event_data["data"], dict) and "role" in event_data["data"]:
+                                             # This is a new-style system message with role/content in data field
+                                             payload = event_data["data"]
+                                         else:
+                                             # This is an existing event format - send the whole event_data
+                                             payload = event_data
+
+                                         # Stream the event to UI
+                                         yield f"event: {event_type}\n"
+                                         yield f"data: {json.dumps(payload)}\n\n"
+
+                                         last_redis_event_index = i
+
+                                         logger.debug(
+                                             "redis_event_streamed",
+                                             execution_id=execution_id,
+                                             event_type=event_type,
+                                             index=i
+                                         )
+
+                                     except json.JSONDecodeError:
+                                         logger.warning("invalid_redis_event_json", event=event_json[:100])
+                                         continue
+
+                     except Exception as redis_error:
+                         logger.error("redis_event_read_failed", error=str(redis_error), execution_id=execution_id)
+                         # Continue with Temporal polling even if Redis fails
+
+                 try:
+                     # SECOND: Check Temporal workflow execution status (lightweight metadata lookup)
+                     t0 = time.time()
+                     description = await workflow_handle.describe()
+                     temporal_status = description.status.name  # Get enum name (e.g., "RUNNING")
+                     describe_duration = int((time.time() - t0) * 1000)
+
+                     # Log slow describe calls (>100ms)
+                     if describe_duration > 100:
+                         logger.warning("slow_temporal_describe", duration_ms=describe_duration, execution_id=execution_id)
+
+                     # Update is_workflow_running based on Temporal execution status
+                     # Only stream from Redis when workflow is actively being processed by a worker
+                     previous_running_state = is_workflow_running
+                     is_workflow_running = temporal_status == "RUNNING"
+
+                     # Log when streaming mode changes
+                     if previous_running_state != is_workflow_running:
+                         logger.info(
+                             "streaming_mode_changed",
+                             execution_id=execution_id,
+                             temporal_status=temporal_status,
+                             is_workflow_running=is_workflow_running,
+                             mode="redis_streaming" if is_workflow_running else "database_only"
+                         )
+
+                     # If workflow finished, send appropriate event and exit
+                     if temporal_status in ["COMPLETED", "FAILED", "TERMINATED", "CANCELLED"]:
+                         # Query workflow state one final time to get the complete results
+                         try:
+                             state = await workflow_handle.query(AgentExecutionWorkflow.get_state)
+
+                             if temporal_status in ["COMPLETED", "TERMINATED"]:
+                                 done_data = {
+                                     "execution_id": execution_id,
+                                     "status": "completed",
+                                     "response": state.current_response,
+                                     "usage": state.usage,
+                                     "metadata": state.metadata,
+                                 }
+                                 yield f"event: done\n"
+                                 yield f"data: {json.dumps(done_data)}\n\n"
+                             else:  # FAILED or CANCELLED
+                                 error_data = {
+                                     "error": state.error_message or f"Workflow {temporal_status.lower()}",
+                                     "execution_id": execution_id,
+                                     "status": "failed",
+                                 }
+                                 if state.metadata.get("error_type"):
+                                     error_data["error_type"] = state.metadata["error_type"]
+                                 yield f"event: error\n"
+                                 yield f"data: {json.dumps(error_data)}\n\n"
+                         except Exception as final_query_error:
+                             # If we can't query for final state, fall back to database
+                             logger.warning("final_state_query_failed", execution_id=execution_id, error=str(final_query_error))
+
+                             # Try to get final status from database
+                             try:
+                                 exec_result = (
+                                     client.table("executions")
+                                     .select("status, response, error_message, usage, execution_metadata")
+                                     .eq("id", execution_id)
+                                     .single()
+                                     .execute()
+                                 )
+
+                                 if exec_result.data:
+                                     if temporal_status in ["COMPLETED", "TERMINATED"]:
+                                         done_data = {
+                                             "execution_id": execution_id,
+                                             "status": exec_result.data.get("status", "completed"),
+                                             "response": exec_result.data.get("response"),
+                                             "usage": exec_result.data.get("usage", {}),
+                                             "metadata": exec_result.data.get("execution_metadata", {}),
+                                         }
+                                         yield f"event: done\n"
+                                         yield f"data: {json.dumps(done_data)}\n\n"
+                                     else:
+                                         error_data = {
+                                             "error": exec_result.data.get("error_message") or f"Workflow {temporal_status.lower()}",
+                                             "execution_id": execution_id,
+                                             "status": exec_result.data.get("status", "failed"),
+                                         }
+                                         yield f"event: error\n"
+                                         yield f"data: {json.dumps(error_data)}\n\n"
+                                 else:
+                                     yield f"event: done\n"
+                                     yield f"data: {json.dumps({'execution_id': execution_id, 'workflow_status': temporal_status})}\n\n"
+                             except Exception as db_error:
+                                 logger.error("database_fallback_failed", execution_id=execution_id, error=str(db_error))
+                                 yield f"event: done\n"
+                                 yield f"data: {json.dumps({'execution_id': execution_id, 'workflow_status': temporal_status})}\n\n"
+                         break
+
+                     # THIRD: Query workflow state for application-level details (messages, usage, etc.)
+                     # Only do this if workflow is still running to get incremental updates
+                     try:
+                         state = await workflow_handle.query(AgentExecutionWorkflow.get_state)
+
+                         # Reset failure counter on successful query
+                         if consecutive_failures > 0:
+                             logger.info(
+                                 "workflow_query_recovered",
+                                 execution_id=execution_id,
+                                 failures=consecutive_failures
+                             )
+                             consecutive_failures = 0
+                             worker_down_mode = False
+
+                         # Send status update if changed
+                         if state.status != last_status:
+                             yield f"event: status\n"
+                             yield f"data: {json.dumps({'status': state.status, 'execution_id': execution_id})}\n\n"
+                             last_status = state.status
+
+                             logger.info(
+                                 "execution_status_update",
+                                 execution_id=execution_id,
+                                 status=state.status
+                             )
+
+                         # Send new messages incrementally
+                         # Skip assistant messages - they're already streamed via message_chunk events
+                         if len(state.messages) > last_message_count:
+                             new_messages = state.messages[last_message_count:]
+                             for msg in new_messages:
+                                 # Skip assistant messages to prevent duplicates with chunk streaming
+                                 if msg.role == "assistant":
+                                     continue
+
+                                 msg_data = {
+                                     "role": msg.role,
+                                     "content": msg.content,
+                                     "timestamp": msg.timestamp,
+                                 }
+                                 if msg.tool_name:
+                                     msg_data["tool_name"] = msg.tool_name
+                                     msg_data["tool_input"] = msg.tool_input
+                                     msg_data["tool_output"] = msg.tool_output
+                                 # Include user attribution for messages
+                                 if hasattr(msg, 'user_id') and msg.user_id:
+                                     msg_data["user_id"] = msg.user_id
+                                     msg_data["user_name"] = msg.user_name
+                                     msg_data["user_email"] = msg.user_email
+                                     msg_data["user_avatar"] = msg.user_avatar
+
+                                 yield f"event: message\n"
+                                 yield f"data: {json.dumps(msg_data)}\n\n"
+
+                             last_message_count = len(state.messages)
+
+                     except Exception as query_error:
+                         # Workflow query failed - track failures and switch to database fallback
+                         consecutive_failures += 1
+                         error_msg = str(query_error)
+
+                         # Detect worker down condition
+                         is_worker_down = "no poller seen" in error_msg or "workflow not found" in error_msg
+
+                         if consecutive_failures >= 3 and not worker_down_mode:
+                             worker_down_mode = True
+                             logger.warning(
+                                 "worker_down_detected_switching_to_database_mode",
+                                 execution_id=execution_id,
+                                 failures=consecutive_failures,
+                                 error=error_msg
+                             )
+
+                         # In worker down mode, poll database for updates
+                         if worker_down_mode:
+                             current_time = time.time()
+                             # Poll database every 2 seconds when worker is down
+                             if current_time - last_db_poll >= 2.0:
+                                 try:
+                                     # Check execution status from database
+                                     exec_result = (
+                                         client.table("executions")
+                                         .select("status, response, error_message")
+                                         .eq("id", execution_id)
+                                         .single()
+                                         .execute()
+                                     )
+
+                                     if exec_result.data:
+                                         db_status = exec_result.data.get("status")
+
+                                         # Send status update if changed
+                                         if db_status and db_status != last_status:
+                                             yield f"event: status\n"
+                                             yield f"data: {json.dumps({'status': db_status, 'execution_id': execution_id, 'source': 'database'})}\n\n"
+                                             last_status = db_status
+
+                                             logger.info(
+                                                 "database_status_update",
+                                                 execution_id=execution_id,
+                                                 status=db_status
+                                             )
+
+                                         # Check if execution finished
+                                         if db_status in ["completed", "failed", "cancelled"]:
+                                             if db_status == "completed":
+                                                 done_data = {
+                                                     "execution_id": execution_id,
+                                                     "status": db_status,
+                                                     "response": exec_result.data.get("response"),
+                                                 }
+                                                 yield f"event: done\n"
+                                                 yield f"data: {json.dumps(done_data)}\n\n"
+                                             else:
+                                                 error_data = {
+                                                     "error": exec_result.data.get("error_message") or f"Execution {db_status}",
+                                                     "execution_id": execution_id,
+                                                     "status": db_status,
+                                                 }
+                                                 yield f"event: error\n"
+                                                 yield f"data: {json.dumps(error_data)}\n\n"
+                                             break
+
+                                     # Check for new session messages
+                                     session_result = (
+                                         client.table("sessions")
+                                         .select("messages")
+                                         .eq("execution_id", execution_id)
+                                         .single()
+                                         .execute()
+                                     )
+
+                                     if session_result.data:
+                                         db_messages = session_result.data.get("messages", [])
+                                         if len(db_messages) > last_message_count:
+                                             new_messages = db_messages[last_message_count:]
+                                             for msg_dict in new_messages:
+                                                 yield f"event: message\n"
+                                                 yield f"data: {json.dumps(msg_dict)}\n\n"
+                                             last_message_count = len(db_messages)
+
+                                             logger.info(
+                                                 "database_messages_update",
+                                                 execution_id=execution_id,
+                                                 new_messages=len(new_messages)
+                                             )
+
+                                     last_db_poll = current_time
+
+                                 except Exception as db_poll_error:
+                                     logger.error(
+                                         "database_poll_failed",
+                                         execution_id=execution_id,
+                                         error=str(db_poll_error)
+                                     )
+                         else:
+                             # Still trying to connect to worker - log but don't switch modes yet
+                             logger.debug(
+                                 "workflow_query_failed",
+                                 execution_id=execution_id,
+                                 failures=consecutive_failures,
+                                 error=error_msg
+                             )
+
+                     # Poll every 200ms for real-time updates when worker is up
+                     # Poll every 500ms when in worker down mode (database polling)
+                     await asyncio.sleep(0.5 if worker_down_mode else 0.2)
+
+                 except Exception as error:
+                     # Critical error (e.g., workflow describe failed)
+                     logger.error(
+                         "critical_streaming_error",
+                         execution_id=execution_id,
+                         error=str(error)
+                     )
+                     # Back off and retry
+                     await asyncio.sleep(1.0)
+
+         except Exception as e:
+             logger.error("execution_stream_error", error=str(e), execution_id=execution_id)
+             yield f"event: error\n"
+             yield f"data: {json.dumps({'error': str(e)})}\n\n"
+
+     return StreamingResponse(
+         generate_sse(),
+         media_type="text/event-stream",
+         headers={
+             "Cache-Control": "no-cache",
+             "Connection": "keep-alive",
+             "X-Accel-Buffering": "no",  # Disable nginx buffering
+         }
+     )
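To close the loop, here is a minimal, illustrative consumer for the stream endpoint above. It is not part of the package: the base URL, router mount point, and auth header are placeholders, and it assumes `httpx` for the streaming request. It handles the `event:`/`data:` framing, skips the `: keepalive` comments, and stops on the terminal `done`/`error` events emitted by the generator.

    # Illustrative only - a minimal SSE client for GET {base_url}/{execution_id}/stream.
    import httpx


    def follow_execution(base_url: str, execution_id: str, api_key: str) -> None:
        url = f"{base_url}/{execution_id}/stream"
        # Placeholder auth header; the real scheme depends on how get_current_organization authenticates.
        headers = {"Authorization": f"Bearer {api_key}", "Accept": "text/event-stream"}
        with httpx.stream("GET", url, headers=headers, timeout=None) as response:
            event_type = "message"
            for line in response.iter_lines():
                # Blank lines separate events; lines starting with ":" are keepalive comments.
                if not line or line.startswith(":"):
                    continue
                if line.startswith("event:"):
                    event_type = line.split(":", 1)[1].strip()
                elif line.startswith("data:"):
                    payload = line.split(":", 1)[1].strip()
                    print(event_type, payload)
                    if event_type in ("done", "error"):
                        return

Because the server always writes an `event:` line before the matching `data:` line, tracking the last seen event type is enough for this endpoint; a general-purpose SSE parser would also reset the type on blank lines.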