whale-code 6.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (319)
  1. package/README.md +95 -0
  2. package/bin/swag-agent.js +9 -0
  3. package/bin/swagmanager-mcp.js +321 -0
  4. package/dist/cli/app.d.ts +26 -0
  5. package/dist/cli/app.js +64 -0
  6. package/dist/cli/chat/AgentSelector.d.ts +14 -0
  7. package/dist/cli/chat/AgentSelector.js +14 -0
  8. package/dist/cli/chat/ChatApp.d.ts +9 -0
  9. package/dist/cli/chat/ChatApp.js +267 -0
  10. package/dist/cli/chat/ChatInput.d.ts +39 -0
  11. package/dist/cli/chat/ChatInput.js +509 -0
  12. package/dist/cli/chat/MarkdownText.d.ts +10 -0
  13. package/dist/cli/chat/MarkdownText.js +20 -0
  14. package/dist/cli/chat/MessageList.d.ts +37 -0
  15. package/dist/cli/chat/MessageList.js +80 -0
  16. package/dist/cli/chat/ModelSelector.d.ts +20 -0
  17. package/dist/cli/chat/ModelSelector.js +73 -0
  18. package/dist/cli/chat/RewindViewer.d.ts +26 -0
  19. package/dist/cli/chat/RewindViewer.js +185 -0
  20. package/dist/cli/chat/StoreSelector.d.ts +14 -0
  21. package/dist/cli/chat/StoreSelector.js +24 -0
  22. package/dist/cli/chat/StreamingText.d.ts +12 -0
  23. package/dist/cli/chat/StreamingText.js +12 -0
  24. package/dist/cli/chat/SubagentPanel.d.ts +45 -0
  25. package/dist/cli/chat/SubagentPanel.js +110 -0
  26. package/dist/cli/chat/TeamPanel.d.ts +21 -0
  27. package/dist/cli/chat/TeamPanel.js +42 -0
  28. package/dist/cli/chat/ToolIndicator.d.ts +25 -0
  29. package/dist/cli/chat/ToolIndicator.js +436 -0
  30. package/dist/cli/chat/hooks/useAgentLoop.d.ts +39 -0
  31. package/dist/cli/chat/hooks/useAgentLoop.js +382 -0
  32. package/dist/cli/chat/hooks/useSlashCommands.d.ts +37 -0
  33. package/dist/cli/chat/hooks/useSlashCommands.js +387 -0
  34. package/dist/cli/commands/config-cmd.d.ts +10 -0
  35. package/dist/cli/commands/config-cmd.js +99 -0
  36. package/dist/cli/commands/doctor.d.ts +14 -0
  37. package/dist/cli/commands/doctor.js +172 -0
  38. package/dist/cli/commands/init.d.ts +16 -0
  39. package/dist/cli/commands/init.js +278 -0
  40. package/dist/cli/commands/mcp.d.ts +12 -0
  41. package/dist/cli/commands/mcp.js +162 -0
  42. package/dist/cli/login/LoginApp.d.ts +7 -0
  43. package/dist/cli/login/LoginApp.js +157 -0
  44. package/dist/cli/print-mode.d.ts +31 -0
  45. package/dist/cli/print-mode.js +202 -0
  46. package/dist/cli/serve-mode.d.ts +37 -0
  47. package/dist/cli/serve-mode.js +636 -0
  48. package/dist/cli/services/agent-definitions.d.ts +25 -0
  49. package/dist/cli/services/agent-definitions.js +91 -0
  50. package/dist/cli/services/agent-events.d.ts +178 -0
  51. package/dist/cli/services/agent-events.js +175 -0
  52. package/dist/cli/services/agent-loop.d.ts +90 -0
  53. package/dist/cli/services/agent-loop.js +762 -0
  54. package/dist/cli/services/agent-worker-base.d.ts +97 -0
  55. package/dist/cli/services/agent-worker-base.js +220 -0
  56. package/dist/cli/services/auth-service.d.ts +30 -0
  57. package/dist/cli/services/auth-service.js +160 -0
  58. package/dist/cli/services/background-processes.d.ts +126 -0
  59. package/dist/cli/services/background-processes.js +318 -0
  60. package/dist/cli/services/browser-auth.d.ts +24 -0
  61. package/dist/cli/services/browser-auth.js +180 -0
  62. package/dist/cli/services/claude-md-loader.d.ts +16 -0
  63. package/dist/cli/services/claude-md-loader.js +58 -0
  64. package/dist/cli/services/config-store.d.ts +47 -0
  65. package/dist/cli/services/config-store.js +79 -0
  66. package/dist/cli/services/debug-log.d.ts +10 -0
  67. package/dist/cli/services/debug-log.js +52 -0
  68. package/dist/cli/services/error-logger.d.ts +58 -0
  69. package/dist/cli/services/error-logger.js +269 -0
  70. package/dist/cli/services/file-history.d.ts +21 -0
  71. package/dist/cli/services/file-history.js +83 -0
  72. package/dist/cli/services/format-server-response.d.ts +16 -0
  73. package/dist/cli/services/format-server-response.js +440 -0
  74. package/dist/cli/services/git-context.d.ts +11 -0
  75. package/dist/cli/services/git-context.js +66 -0
  76. package/dist/cli/services/hooks.d.ts +85 -0
  77. package/dist/cli/services/hooks.js +258 -0
  78. package/dist/cli/services/interactive-tools.d.ts +125 -0
  79. package/dist/cli/services/interactive-tools.js +260 -0
  80. package/dist/cli/services/keybinding-manager.d.ts +52 -0
  81. package/dist/cli/services/keybinding-manager.js +115 -0
  82. package/dist/cli/services/local-tools.d.ts +22 -0
  83. package/dist/cli/services/local-tools.js +697 -0
  84. package/dist/cli/services/lsp-manager.d.ts +18 -0
  85. package/dist/cli/services/lsp-manager.js +717 -0
  86. package/dist/cli/services/mcp-client.d.ts +48 -0
  87. package/dist/cli/services/mcp-client.js +157 -0
  88. package/dist/cli/services/memory-manager.d.ts +16 -0
  89. package/dist/cli/services/memory-manager.js +57 -0
  90. package/dist/cli/services/model-manager.d.ts +18 -0
  91. package/dist/cli/services/model-manager.js +71 -0
  92. package/dist/cli/services/model-router.d.ts +26 -0
  93. package/dist/cli/services/model-router.js +149 -0
  94. package/dist/cli/services/permission-modes.d.ts +13 -0
  95. package/dist/cli/services/permission-modes.js +43 -0
  96. package/dist/cli/services/rewind.d.ts +84 -0
  97. package/dist/cli/services/rewind.js +194 -0
  98. package/dist/cli/services/ripgrep.d.ts +28 -0
  99. package/dist/cli/services/ripgrep.js +138 -0
  100. package/dist/cli/services/sandbox.d.ts +29 -0
  101. package/dist/cli/services/sandbox.js +97 -0
  102. package/dist/cli/services/server-tools.d.ts +61 -0
  103. package/dist/cli/services/server-tools.js +543 -0
  104. package/dist/cli/services/session-persistence.d.ts +23 -0
  105. package/dist/cli/services/session-persistence.js +99 -0
  106. package/dist/cli/services/subagent-worker.d.ts +19 -0
  107. package/dist/cli/services/subagent-worker.js +41 -0
  108. package/dist/cli/services/subagent.d.ts +47 -0
  109. package/dist/cli/services/subagent.js +647 -0
  110. package/dist/cli/services/system-prompt.d.ts +7 -0
  111. package/dist/cli/services/system-prompt.js +198 -0
  112. package/dist/cli/services/team-lead.d.ts +73 -0
  113. package/dist/cli/services/team-lead.js +512 -0
  114. package/dist/cli/services/team-state.d.ts +77 -0
  115. package/dist/cli/services/team-state.js +398 -0
  116. package/dist/cli/services/teammate.d.ts +31 -0
  117. package/dist/cli/services/teammate.js +689 -0
  118. package/dist/cli/services/telemetry.d.ts +61 -0
  119. package/dist/cli/services/telemetry.js +209 -0
  120. package/dist/cli/services/tools/agent-tools.d.ts +14 -0
  121. package/dist/cli/services/tools/agent-tools.js +347 -0
  122. package/dist/cli/services/tools/file-ops.d.ts +15 -0
  123. package/dist/cli/services/tools/file-ops.js +487 -0
  124. package/dist/cli/services/tools/search-tools.d.ts +8 -0
  125. package/dist/cli/services/tools/search-tools.js +186 -0
  126. package/dist/cli/services/tools/shell-exec.d.ts +10 -0
  127. package/dist/cli/services/tools/shell-exec.js +168 -0
  128. package/dist/cli/services/tools/task-manager.d.ts +28 -0
  129. package/dist/cli/services/tools/task-manager.js +209 -0
  130. package/dist/cli/services/tools/web-tools.d.ts +11 -0
  131. package/dist/cli/services/tools/web-tools.js +395 -0
  132. package/dist/cli/setup/SetupApp.d.ts +9 -0
  133. package/dist/cli/setup/SetupApp.js +191 -0
  134. package/dist/cli/shared/MatrixIntro.d.ts +4 -0
  135. package/dist/cli/shared/MatrixIntro.js +83 -0
  136. package/dist/cli/shared/Theme.d.ts +74 -0
  137. package/dist/cli/shared/Theme.js +127 -0
  138. package/dist/cli/shared/WhaleBanner.d.ts +10 -0
  139. package/dist/cli/shared/WhaleBanner.js +12 -0
  140. package/dist/cli/shared/markdown.d.ts +21 -0
  141. package/dist/cli/shared/markdown.js +756 -0
  142. package/dist/cli/status/StatusApp.d.ts +4 -0
  143. package/dist/cli/status/StatusApp.js +105 -0
  144. package/dist/cli/stores/StoreApp.d.ts +7 -0
  145. package/dist/cli/stores/StoreApp.js +81 -0
  146. package/dist/index.d.ts +15 -0
  147. package/dist/index.js +538 -0
  148. package/dist/local-agent/connection.d.ts +48 -0
  149. package/dist/local-agent/connection.js +332 -0
  150. package/dist/local-agent/discovery.d.ts +18 -0
  151. package/dist/local-agent/discovery.js +146 -0
  152. package/dist/local-agent/executor.d.ts +34 -0
  153. package/dist/local-agent/executor.js +241 -0
  154. package/dist/local-agent/index.d.ts +14 -0
  155. package/dist/local-agent/index.js +198 -0
  156. package/dist/node/adapters/base.d.ts +35 -0
  157. package/dist/node/adapters/base.js +10 -0
  158. package/dist/node/adapters/discord.d.ts +29 -0
  159. package/dist/node/adapters/discord.js +299 -0
  160. package/dist/node/adapters/email.d.ts +23 -0
  161. package/dist/node/adapters/email.js +218 -0
  162. package/dist/node/adapters/imessage.d.ts +17 -0
  163. package/dist/node/adapters/imessage.js +118 -0
  164. package/dist/node/adapters/slack.d.ts +26 -0
  165. package/dist/node/adapters/slack.js +259 -0
  166. package/dist/node/adapters/sms.d.ts +23 -0
  167. package/dist/node/adapters/sms.js +161 -0
  168. package/dist/node/adapters/telegram.d.ts +17 -0
  169. package/dist/node/adapters/telegram.js +101 -0
  170. package/dist/node/adapters/webchat.d.ts +27 -0
  171. package/dist/node/adapters/webchat.js +160 -0
  172. package/dist/node/adapters/whatsapp.d.ts +28 -0
  173. package/dist/node/adapters/whatsapp.js +230 -0
  174. package/dist/node/cli.d.ts +2 -0
  175. package/dist/node/cli.js +325 -0
  176. package/dist/node/config.d.ts +17 -0
  177. package/dist/node/config.js +31 -0
  178. package/dist/node/runtime.d.ts +50 -0
  179. package/dist/node/runtime.js +351 -0
  180. package/dist/server/handlers/__test-utils__/mock-supabase.d.ts +11 -0
  181. package/dist/server/handlers/__test-utils__/mock-supabase.js +393 -0
  182. package/dist/server/handlers/analytics.d.ts +17 -0
  183. package/dist/server/handlers/analytics.js +266 -0
  184. package/dist/server/handlers/api-keys.d.ts +6 -0
  185. package/dist/server/handlers/api-keys.js +221 -0
  186. package/dist/server/handlers/billing.d.ts +33 -0
  187. package/dist/server/handlers/billing.js +272 -0
  188. package/dist/server/handlers/browser.d.ts +10 -0
  189. package/dist/server/handlers/browser.js +517 -0
  190. package/dist/server/handlers/catalog.d.ts +99 -0
  191. package/dist/server/handlers/catalog.js +976 -0
  192. package/dist/server/handlers/comms.d.ts +254 -0
  193. package/dist/server/handlers/comms.js +588 -0
  194. package/dist/server/handlers/creations.d.ts +6 -0
  195. package/dist/server/handlers/creations.js +479 -0
  196. package/dist/server/handlers/crm.d.ts +89 -0
  197. package/dist/server/handlers/crm.js +538 -0
  198. package/dist/server/handlers/discovery.d.ts +6 -0
  199. package/dist/server/handlers/discovery.js +288 -0
  200. package/dist/server/handlers/embeddings.d.ts +92 -0
  201. package/dist/server/handlers/embeddings.js +197 -0
  202. package/dist/server/handlers/enrichment.d.ts +8 -0
  203. package/dist/server/handlers/enrichment.js +768 -0
  204. package/dist/server/handlers/image-gen.d.ts +6 -0
  205. package/dist/server/handlers/image-gen.js +409 -0
  206. package/dist/server/handlers/inventory.d.ts +319 -0
  207. package/dist/server/handlers/inventory.js +447 -0
  208. package/dist/server/handlers/kali.d.ts +10 -0
  209. package/dist/server/handlers/kali.js +210 -0
  210. package/dist/server/handlers/llm-providers.d.ts +6 -0
  211. package/dist/server/handlers/llm-providers.js +673 -0
  212. package/dist/server/handlers/local-agent.d.ts +6 -0
  213. package/dist/server/handlers/local-agent.js +118 -0
  214. package/dist/server/handlers/meta-ads.d.ts +111 -0
  215. package/dist/server/handlers/meta-ads.js +2279 -0
  216. package/dist/server/handlers/nodes.d.ts +33 -0
  217. package/dist/server/handlers/nodes.js +699 -0
  218. package/dist/server/handlers/operations.d.ts +138 -0
  219. package/dist/server/handlers/operations.js +131 -0
  220. package/dist/server/handlers/platform.d.ts +23 -0
  221. package/dist/server/handlers/platform.js +227 -0
  222. package/dist/server/handlers/supply-chain.d.ts +19 -0
  223. package/dist/server/handlers/supply-chain.js +327 -0
  224. package/dist/server/handlers/transcription.d.ts +17 -0
  225. package/dist/server/handlers/transcription.js +121 -0
  226. package/dist/server/handlers/video-gen.d.ts +6 -0
  227. package/dist/server/handlers/video-gen.js +466 -0
  228. package/dist/server/handlers/voice.d.ts +8 -0
  229. package/dist/server/handlers/voice.js +1146 -0
  230. package/dist/server/handlers/workflow-steps.d.ts +86 -0
  231. package/dist/server/handlers/workflow-steps.js +2349 -0
  232. package/dist/server/handlers/workflows.d.ts +7 -0
  233. package/dist/server/handlers/workflows.js +989 -0
  234. package/dist/server/index.d.ts +1 -0
  235. package/dist/server/index.js +2427 -0
  236. package/dist/server/lib/batch-client.d.ts +80 -0
  237. package/dist/server/lib/batch-client.js +467 -0
  238. package/dist/server/lib/code-worker-pool.d.ts +31 -0
  239. package/dist/server/lib/code-worker-pool.js +224 -0
  240. package/dist/server/lib/code-worker.d.ts +1 -0
  241. package/dist/server/lib/code-worker.js +188 -0
  242. package/dist/server/lib/compaction-service.d.ts +32 -0
  243. package/dist/server/lib/compaction-service.js +162 -0
  244. package/dist/server/lib/logger.d.ts +19 -0
  245. package/dist/server/lib/logger.js +46 -0
  246. package/dist/server/lib/otel.d.ts +38 -0
  247. package/dist/server/lib/otel.js +126 -0
  248. package/dist/server/lib/pg-rate-limiter.d.ts +21 -0
  249. package/dist/server/lib/pg-rate-limiter.js +86 -0
  250. package/dist/server/lib/prompt-sanitizer.d.ts +37 -0
  251. package/dist/server/lib/prompt-sanitizer.js +177 -0
  252. package/dist/server/lib/provider-capabilities.d.ts +85 -0
  253. package/dist/server/lib/provider-capabilities.js +190 -0
  254. package/dist/server/lib/provider-failover.d.ts +74 -0
  255. package/dist/server/lib/provider-failover.js +210 -0
  256. package/dist/server/lib/rate-limiter.d.ts +39 -0
  257. package/dist/server/lib/rate-limiter.js +147 -0
  258. package/dist/server/lib/server-agent-loop.d.ts +107 -0
  259. package/dist/server/lib/server-agent-loop.js +667 -0
  260. package/dist/server/lib/server-subagent.d.ts +78 -0
  261. package/dist/server/lib/server-subagent.js +203 -0
  262. package/dist/server/lib/session-checkpoint.d.ts +51 -0
  263. package/dist/server/lib/session-checkpoint.js +145 -0
  264. package/dist/server/lib/ssrf-guard.d.ts +13 -0
  265. package/dist/server/lib/ssrf-guard.js +240 -0
  266. package/dist/server/lib/supabase-client.d.ts +7 -0
  267. package/dist/server/lib/supabase-client.js +78 -0
  268. package/dist/server/lib/template-resolver.d.ts +31 -0
  269. package/dist/server/lib/template-resolver.js +215 -0
  270. package/dist/server/lib/utils.d.ts +16 -0
  271. package/dist/server/lib/utils.js +147 -0
  272. package/dist/server/local-agent-gateway.d.ts +82 -0
  273. package/dist/server/local-agent-gateway.js +426 -0
  274. package/dist/server/providers/anthropic.d.ts +20 -0
  275. package/dist/server/providers/anthropic.js +199 -0
  276. package/dist/server/providers/bedrock.d.ts +20 -0
  277. package/dist/server/providers/bedrock.js +194 -0
  278. package/dist/server/providers/gemini.d.ts +24 -0
  279. package/dist/server/providers/gemini.js +486 -0
  280. package/dist/server/providers/openai.d.ts +24 -0
  281. package/dist/server/providers/openai.js +522 -0
  282. package/dist/server/providers/registry.d.ts +32 -0
  283. package/dist/server/providers/registry.js +58 -0
  284. package/dist/server/providers/shared.d.ts +32 -0
  285. package/dist/server/providers/shared.js +124 -0
  286. package/dist/server/providers/types.d.ts +92 -0
  287. package/dist/server/providers/types.js +12 -0
  288. package/dist/server/proxy-handlers.d.ts +6 -0
  289. package/dist/server/proxy-handlers.js +89 -0
  290. package/dist/server/tool-router.d.ts +149 -0
  291. package/dist/server/tool-router.js +803 -0
  292. package/dist/server/validation.d.ts +24 -0
  293. package/dist/server/validation.js +301 -0
  294. package/dist/server/worker.d.ts +19 -0
  295. package/dist/server/worker.js +201 -0
  296. package/dist/setup.d.ts +8 -0
  297. package/dist/setup.js +181 -0
  298. package/dist/shared/agent-core.d.ts +157 -0
  299. package/dist/shared/agent-core.js +534 -0
  300. package/dist/shared/anthropic-types.d.ts +105 -0
  301. package/dist/shared/anthropic-types.js +7 -0
  302. package/dist/shared/api-client.d.ts +90 -0
  303. package/dist/shared/api-client.js +379 -0
  304. package/dist/shared/constants.d.ts +33 -0
  305. package/dist/shared/constants.js +80 -0
  306. package/dist/shared/sse-parser.d.ts +26 -0
  307. package/dist/shared/sse-parser.js +259 -0
  308. package/dist/shared/tool-dispatch.d.ts +52 -0
  309. package/dist/shared/tool-dispatch.js +191 -0
  310. package/dist/shared/types.d.ts +72 -0
  311. package/dist/shared/types.js +7 -0
  312. package/dist/updater.d.ts +25 -0
  313. package/dist/updater.js +140 -0
  314. package/dist/webchat/widget.d.ts +0 -0
  315. package/dist/webchat/widget.js +397 -0
  316. package/package.json +95 -0
  317. package/src/cli/services/builtin-skills/commit.md +19 -0
  318. package/src/cli/services/builtin-skills/review-pr.md +21 -0
  319. package/src/cli/services/builtin-skills/review.md +18 -0
@@ -0,0 +1,2349 @@
1
+ // server/handlers/workflow-steps.ts — Step executor engine
2
+ // Extracted from workflows.ts to separate step execution from workflow CRUD/management.
3
+ //
4
+ // Contains: step type executors, executeAndAdvance, inline chain execution,
5
+ // circuit breakers, code execution (JS/Python), cron parser, schedule/timeout processing,
6
+ // event trigger processing, flow control, webhook ingestion, and all step advancement helpers.
7
+ import { createHmac, timingSafeEqual, randomUUID } from "node:crypto";
8
+ import { resolveTemplate, evaluateCondition } from "../lib/template-resolver.js";
9
+ import { sanitizeError } from "../../shared/agent-core.js";
10
+ import { executeWithPool, initWorkerPool, getPoolStats, shutdownPool } from "../lib/code-worker-pool.js";
11
+ import { batchClient } from "../lib/batch-client.js";
12
+ import { getProvider } from "../../shared/constants.js";
13
+ import { createLogger } from "../lib/logger.js";
14
+ import { startSpan } from "../lib/otel.js";
15
// Module-scoped structured logger, namespaced to this handler file.
const log = createLogger("workflow-steps");
// ============================================================================
// CONSTANTS
// ============================================================================
const MAX_INLINE_DEPTH = 50; // cap on inline chain recursion depth — presumably guards executeAndAdvance; confirm against caller
const CODE_TIMEOUT_MS = 5000; // wall-clock limit for sandboxed code-step execution
const CODE_OUTPUT_MAX = 102_400; // 100KB cap on captured code-step output
const MAX_FOR_EACH_ITEMS = 1000; // P2 FIX: Prevent unbounded for_each expansion
const MAX_PARALLEL_CHILDREN = 100; // P1 FIX: Cap parallel step fan-out
// HMAC signing secret for guest-approval URLs. Falls back to the Fly internal
// secret; an empty string disables the guest-approval feature entirely.
const GUEST_APPROVAL_SECRET = process.env.GUEST_APPROVAL_SECRET || process.env.FLY_INTERNAL_SECRET || "";
// Public base URL that signed guest-approval links point at.
const GUEST_APPROVAL_BASE_URL = "https://whale-agent.fly.dev/approvals/guest";
26
+ // ============================================================================
27
+ // GUEST APPROVAL — HMAC-signed URLs for unauthenticated approvers
28
+ // ============================================================================
29
/**
 * Build an HMAC-signed approval URL for an unauthenticated (guest) approver.
 *
 * @param {string} approvalId - ID of the pending approval record.
 * @param {string} action - Approval action (e.g. "approve" / "reject").
 * @param {string} expiresAt - Expiry timestamp baked into the signature.
 * @returns {string|null} Signed URL, or null when guest approvals are disabled.
 */
export function generateGuestApprovalUrl(approvalId, action, expiresAt) {
    if (!GUEST_APPROVAL_SECRET)
        return null; // Guest approvals disabled — no signing secret configured
    // The signature covers the raw (unencoded) tuple; the verifier recomputes
    // it over the decoded query parameters it receives.
    const payload = `${approvalId}:${action}:${expiresAt}`;
    const sig = createHmac("sha256", GUEST_APPROVAL_SECRET).update(payload).digest("hex");
    // FIX: URL-encode `action` the same way `expiresAt` already is, so the
    // query string stays well-formed if an action value ever contains
    // reserved characters. (Safe values like "approve" are unchanged.)
    return `${GUEST_APPROVAL_BASE_URL}/${approvalId}?action=${encodeURIComponent(action)}&expires=${encodeURIComponent(expiresAt)}&sig=${sig}`;
}
36
/**
 * Verify the HMAC signature on a guest-approval request in constant time.
 *
 * Recomputes the expected digest over `approvalId:action:expiresAt` and
 * compares it against the supplied hex signature. Returns false when guest
 * approvals are disabled (no signing secret), when the signature does not
 * match, or when `sig` is malformed hex (length mismatch makes
 * timingSafeEqual throw, which we translate into a rejection).
 */
export function verifyGuestApprovalSignature(approvalId, action, expiresAt, sig) {
    // Feature is off entirely without a signing secret.
    if (!GUEST_APPROVAL_SECRET) {
        return false;
    }
    const expectedHex = createHmac("sha256", GUEST_APPROVAL_SECRET)
        .update(`${approvalId}:${action}:${expiresAt}`)
        .digest("hex");
    try {
        const provided = Buffer.from(sig, "hex");
        const expected = Buffer.from(expectedHex, "hex");
        return timingSafeEqual(provided, expected);
    }
    catch {
        // Length mismatch or non-hex input — treat as an invalid signature.
        return false;
    }
}
48
+ // ============================================================================
49
+ // EVENT JOURNAL — append-only state transition log
50
+ // ============================================================================
51
/**
 * Append one entry to the append-only workflow event journal
 * (`workflow_events` table).
 *
 * Insert failures are logged at warn level but never thrown — journaling
 * must not break workflow execution.
 *
 * @param {object} supabase - Supabase client.
 * @param {string} runId - Workflow run ID.
 * @param {string} eventType - State-transition event type.
 * @param {object} payload - Arbitrary event payload.
 * @param {string} [stepRunId] - Optional step-run ID; stored as null if absent.
 */
export async function logWorkflowEvent(supabase, runId, eventType, payload, stepRunId) {
    const row = {
        run_id: runId,
        step_run_id: stepRunId || null,
        event_type: eventType,
        payload,
    };
    const { error } = await supabase.from("workflow_events").insert(row);
    if (error) {
        log.warn({ err: error.message, runId, eventType }, "logWorkflowEvent insert failed");
    }
}
61
+ // ============================================================================
62
+ // FLOW CONTROL — concurrency + rate limiting at step level
63
+ // ============================================================================
64
/**
 * Enforce per-step flow control before execution: an optional concurrency
 * ceiling and an optional rolling-window rate limit. Both queries are scoped
 * by store_id (P0 FIX) so one tenant's activity cannot throttle another's.
 *
 * @param {object} supabase - Supabase client.
 * @param {object} step - Step record (step_config, step_key, store_id, step_run_id).
 * @returns {Promise<{allowed: boolean, reason?: string}>}
 */
async function checkFlowControl(supabase, step) {
    const cfg = step.step_config;
    // --- Concurrency ceiling -------------------------------------------------
    // Count currently-running step runs sharing this concurrency key within
    // the same store, excluding the step run being evaluated.
    const maxConcurrent = cfg.concurrency_limit;
    if (maxConcurrent && maxConcurrent > 0) {
        const key = cfg.concurrency_key || step.step_key;
        const { count } = await supabase.from("workflow_step_runs")
            .select("id", { count: "exact", head: true })
            .eq("step_key", key)
            .eq("status", "running")
            .eq("store_id", step.store_id)
            .neq("id", step.step_run_id); // exclude self
        if ((count || 0) >= maxConcurrent) {
            return { allowed: false, reason: `Concurrency limit ${maxConcurrent} reached for '${key}'` };
        }
    }
    // --- Rate limit (max N executions per rolling window) --------------------
    const rateMax = cfg.rate_limit;
    const windowSec = cfg.rate_window_seconds || 60;
    if (rateMax && rateMax > 0) {
        const windowStart = new Date(Date.now() - windowSec * 1000).toISOString();
        const { count } = await supabase.from("workflow_step_runs")
            .select("id", { count: "exact", head: true })
            .eq("step_key", step.step_key)
            .eq("store_id", step.store_id)
            .in("status", ["success", "running"])
            .gte("started_at", windowStart);
        if ((count || 0) >= rateMax) {
            return { allowed: false, reason: `Rate limit ${rateMax}/${windowSec}s reached for '${step.step_key}'` };
        }
    }
    return { allowed: true };
}
99
// ----------------------------------------------------------------------------
// Late-bound executor hooks. The server wires these in at startup, which keeps
// this module free of direct imports of the tool/agent runtimes.
// ----------------------------------------------------------------------------
let _executeTool = null;
let _runAgentQuery = null;
let _broadcastToken = null;
let _broadcastStepError = null;
/** Register the tool-execution function used by tool steps. */
export function setToolExecutor(fn) {
    _executeTool = fn;
}
/** Register the agent-query function used by agent steps. */
export function setAgentExecutor(fn) {
    _runAgentQuery = fn;
}
/** Register the SSE token broadcaster used to stream agent output. */
export function setTokenBroadcaster(fn) {
    _broadcastToken = fn;
}
/** Register the SSE broadcaster used to surface step errors. */
export function setStepErrorBroadcaster(fn) {
    _broadcastStepError = fn;
}
/**
 * Broadcast a step error to SSE clients and persist error_details on the
 * step run row.
 *
 * @param {object} supabase - Supabase client.
 * @param {object} step - Step record (step_key, step_type, step_run_id, run_id, workflow_id).
 * @param {string} errorMessage - Human-readable error description.
 */
async function surfaceStepError(supabase, step, errorMessage) {
    const timestamp = new Date().toISOString();
    // 1. Persist structured error_details on the step run record.
    const errorDetails = {
        step_name: step.step_key,
        step_type: step.step_type,
        error_message: errorMessage,
        timestamp,
    };
    await supabase
        .from("workflow_step_runs")
        .update({ error_details: errorDetails })
        .eq("id", step.step_run_id);
    // 2. Broadcast via SSE so connected clients see the error in real time.
    if (_broadcastStepError) {
        _broadcastStepError(step.run_id, {
            type: "workflow_error",
            workflow_id: step.workflow_id,
            run_id: step.run_id,
            step_name: step.step_key,
            step_type: step.step_type,
            error: errorMessage,
            timestamp,
        });
    }
}
132
+ // ============================================================================
133
+ // STEP EXECUTORS
134
+ // ============================================================================
135
/**
 * Execute a "tool" step: resolve the args template against the run context,
 * auto-inject email template_data when applicable, honor the per-tool circuit
 * breaker, and dispatch through the registered tool executor.
 *
 * @returns {Promise<{success: boolean, output?: any, error?: string}>}
 */
async function executeToolStep(supabase, config, ctx, storeId, traceId) {
    if (!_executeTool) {
        return { success: false, error: "Tool executor not initialized" };
    }
    const toolName = config.tool_name;
    if (!toolName) {
        return { success: false, error: "No tool_name in step config" };
    }
    const resolvedArgs = resolveTemplate((config.args_template || config.args || {}), ctx);
    // For email send steps with no explicit template_data: merge the trigger
    // payload plus every prior step's object output, so {{variable}}
    // placeholders in the email template can resolve.
    const isEmailSend = toolName === "email" &&
        (resolvedArgs.action === "send" || resolvedArgs.action === "send_template");
    if (isEmailSend && !resolvedArgs.template_data) {
        const mergedData = { ...(ctx.trigger || {}) };
        for (const stepData of Object.values(ctx.steps || {})) {
            if (stepData?.output && typeof stepData.output === "object") {
                Object.assign(mergedData, stepData.output);
            }
        }
        if (Object.keys(mergedData).length > 0) {
            resolvedArgs.template_data = mergedData;
        }
    }
    // Circuit breaker: refuse execution while the tool's breaker is open.
    if (config.tool_id) {
        const breaker = await checkToolCircuitBreaker(supabase, config.tool_id);
        if (!breaker.allowed) {
            return { success: false, error: breaker.reason };
        }
    }
    const result = await _executeTool(supabase, toolName, resolvedArgs, storeId, traceId);
    // Record the outcome so the breaker state stays current.
    if (config.tool_id) {
        await updateToolCircuitBreaker(supabase, config.tool_id, result.success, result.error);
    }
    if (result.success) {
        return { success: true, output: result.data };
    }
    return { success: false, error: result.error };
}
167
/**
 * Execute a "condition" step: evaluate a boolean expression against the run
 * context and select the matching branch key (on_true / on_false).
 *
 * @param {object} config - Step config: expression, on_true, on_false.
 * @param {object} ctx - Workflow run context for expression evaluation.
 * @returns {{success: boolean, output?: object, branch?: string, error?: string}}
 */
function executeConditionStep(config, ctx) {
    const expression = config.expression;
    if (!expression)
        return { success: false, error: "No expression in condition step" };
    // FIX: report this failure via top-level `error`, consistent with every
    // other failure path in this module (it was previously nested under
    // `output.error`, hiding the message from callers that read `result.error`).
    if (!config.on_true && !config.on_false) {
        return { success: false, error: "Condition step must have at least on_true or on_false defined" };
    }
    const result = evaluateCondition(expression, ctx);
    // Missing branch configs normalize to undefined so the caller can detect
    // "no branch to follow" uniformly.
    const branch = result ? (config.on_true || undefined) : (config.on_false || undefined);
    return { success: true, output: { condition_result: result, branch }, branch };
}
178
/**
 * Execute a "transform" step: apply the configured template mapping to the
 * run context and return the resolved structure as the step output.
 */
function executeTransformStep(config, ctx) {
    const { mapping } = config;
    if (!mapping) {
        return { success: false, error: "No mapping in transform step" };
    }
    return { success: true, output: resolveTemplate(mapping, ctx) };
}
184
/**
 * Execute an "agent" step: resolve the prompt template, append tool-gating
 * directives, then run the agent either via the Batch API (single-turn only)
 * or via the registered streaming agent loop.
 *
 * @param {object} config - Step config (agent_id, prompt/prompt_template,
 *   allowed_tools, blocked_tools, require_approval_tools, max_turns,
 *   use_batch, model, max_tokens, temperature).
 * @param {object} ctx - Workflow run context for template resolution.
 * @param {string} storeId - Tenant store ID.
 * @param {object} supabase - Supabase client.
 * @param {object} step - Step run record; enables approval checks and SSE streaming.
 * @param {string} traceId - Trace ID for observability.
 * @returns {Promise<{success: boolean, output?: object, error?: string}>}
 */
async function executeAgentStep(config, ctx, storeId, supabase, step, traceId) {
    if (!_runAgentQuery)
        return { success: false, error: "Agent executor not initialized" };
    const agentId = config.agent_id;
    if (!agentId)
        return { success: false, error: "No agent_id in agent step config" };
    const promptTemplate = (config.prompt_template || config.prompt || "");
    const prompt = resolveTemplate(promptTemplate, ctx);
    if (!prompt)
        return { success: false, error: "No prompt resolved for agent step" };
    // AI tool gating — inject allowed/blocked tool lists into prompt.
    // NOTE: gating is advisory (prompt-level), not enforced at the dispatch
    // layer within this function.
    const allowedTools = config.allowed_tools;
    const blockedTools = config.blocked_tools;
    const requireApprovalTools = config.require_approval_tools;
    let gatedPrompt = prompt;
    if (allowedTools?.length) {
        gatedPrompt += `\n\n[SYSTEM: You may ONLY use these tools: ${allowedTools.join(", ")}. Refuse any other tool calls.]`;
    }
    if (blockedTools?.length) {
        gatedPrompt += `\n\n[SYSTEM: You must NEVER use these tools: ${blockedTools.join(", ")}. Use alternatives instead.]`;
    }
    if (requireApprovalTools?.length && step) {
        // Check if approval was already given (stored in step input from approval step)
        const approvedTools = step.input?.approved_tools;
        const pendingTools = requireApprovalTools.filter(t => !approvedTools?.includes(t));
        if (pendingTools.length > 0) {
            gatedPrompt += `\n\n[SYSTEM: The following tools require human approval before use: ${pendingTools.join(", ")}. Do NOT call them — describe what you would do and why, then stop.]`;
        }
    }
    const maxTurns = config.max_turns || 5;
    const useBatch = config.use_batch === true;
    // Batch mode: single-turn LLM call via Batch API for ~50% cost savings.
    // Only valid when no tool loop is needed (the batch API doesn't support
    // agentic tool loops) — hence the maxTurns <= 1 guard; since max_turns
    // defaults to 5, use_batch only takes effect when max_turns is set to 1.
    if (useBatch && maxTurns <= 1) {
        const model = config.model || "claude-sonnet-4-6";
        const provider = getProvider(model);
        // Anything that isn't OpenAI is routed through the Anthropic batch path.
        const batchProvider = (provider === "openai") ? "openai" : "anthropic";
        // 12-hex-char suffix keeps request IDs short but collision-unlikely.
        const requestId = `wf_agent_${randomUUID().replace(/-/g, "").slice(0, 12)}`;
        try {
            const batchResult = await batchClient.processSingle(requestId, batchProvider, model, [{ role: "user", content: gatedPrompt }], { max_tokens: config.max_tokens || 4096, temperature: config.temperature });
            return batchResult.success
                ? { success: true, output: { response: batchResult.text || "", usage: batchResult.usage, batch: true } }
                : { success: false, error: batchResult.error || "Batch agent request failed" };
        }
        catch (err) {
            return { success: false, error: sanitizeError(err) };
        }
    }
    // Wire up token broadcasting for SSE streaming to connected clients
    // (only possible when we have a step record and a registered broadcaster).
    const onToken = step && _broadcastToken
        ? (token) => _broadcastToken(step.run_id, step.step_key, token)
        : undefined;
    const result = await _runAgentQuery(supabase, agentId, gatedPrompt, storeId, maxTurns, onToken, traceId);
    return result.success
        ? { success: true, output: { response: result.response } }
        : { success: false, error: result.error };
}
241
+ // P1 FIX: Use shared SSRF guard module (enhanced with DNS resolve-then-check, IPv6-mapped, CGNAT)
242
+ import { validateUrl } from "../lib/ssrf-guard.js";
243
/**
 * Execute a "webhook_out" step: send a templated HTTP request to an external
 * URL with SSRF protection, optional HMAC body signing, and a 30s timeout.
 *
 * @param {object} config - Step config: url, method, headers, body_template, hmac_secret.
 * @param {object} ctx - Workflow run context for template resolution.
 * @returns {Promise<{success: boolean, output?: object, error?: string}>}
 */
async function executeWebhookOutStep(config, ctx) {
    const url = resolveTemplate(config.url, ctx);
    if (!url)
        return { success: false, error: "No URL in webhook_out step" };
    // P0 FIX: Use async validateUrl (DNS resolve-then-check) instead of sync isBlockedUrl
    const ssrfError = await validateUrl(url);
    if (ssrfError)
        return { success: false, error: `Blocked: ${ssrfError}` };
    const method = (config.method || "POST").toUpperCase();
    const headers = {};
    if (config.headers && typeof config.headers === "object") {
        for (const [k, v] of Object.entries(config.headers)) {
            headers[k] = resolveTemplate(v, ctx);
        }
    }
    let body;
    if (method !== "GET" && method !== "HEAD") {
        const bodyTemplate = config.body_template || {};
        body = JSON.stringify(resolveTemplate(bodyTemplate, ctx));
        if (!headers["Content-Type"])
            headers["Content-Type"] = "application/json";
    }
    // Optional HMAC signature over the exact serialized body.
    if (config.hmac_secret && body) {
        const hmac = createHmac("sha256", config.hmac_secret).update(body).digest("hex");
        headers["X-Webhook-Signature"] = `sha256=${hmac}`;
    }
    const controller = new AbortController();
    const timer = setTimeout(() => controller.abort(), 30_000);
    try {
        const resp = await fetch(url, { method, headers, body, signal: controller.signal });
        const ct = resp.headers.get("content-type") || "";
        const data = ct.includes("json") ? await resp.json() : await resp.text();
        if (!resp.ok)
            return { success: false, error: `HTTP ${resp.status}: ${String(data).substring(0, 500)}` };
        return { success: true, output: { status: resp.status, data } };
    }
    catch (err) {
        if (err.name === "AbortError")
            return { success: false, error: "Webhook request timed out" };
        return { success: false, error: sanitizeError(err) };
    }
    finally {
        // FIX: always clear the abort timer. Previously it was cleared only
        // when fetch resolved, so a rejected fetch leaked a live 30s timer
        // that kept the event loop alive and fired a useless abort. As a side
        // benefit the 30s budget now also bounds the response-body read.
        clearTimeout(timer);
    }
}
286
/** No-op step executor: succeeds immediately, recording only that nothing ran. */
function executeNoopStep() {
    const output = { noop: true };
    return { success: true, output };
}
289
+ // ============================================================================
290
+ // BATCH LLM STEP — non-streaming LLM via Batch API (~50% cost savings)
291
+ // ============================================================================
292
/**
 * Execute an LLM request via the Batch API instead of streaming.
 * Designed for workflow steps that don't need real-time token streaming
 * (batch pricing is roughly half of the realtime API).
 *
 * Step config:
 *   model: string — model ID (e.g. "claude-sonnet-4-6", "gpt-5-mini")
 *   prompt: string — prompt template (resolved with ctx)
 *   system: string — optional system prompt template
 *   max_tokens: number — optional, default 4096
 *   temperature: number — optional
 *   tools: array — optional tool definitions for the LLM
 *
 * @returns {Promise<{success: boolean, output?: object, error?: string}>}
 */
async function executeLlmBatchStep(config, ctx) {
    const model = config.model;
    if (!model)
        return { success: false, error: "No model in llm_batch step config" };
    const prompt = resolveTemplate(config.prompt_template || config.prompt || "", ctx);
    if (!prompt)
        return { success: false, error: "No prompt resolved for llm_batch step" };
    const system = config.system ? resolveTemplate(config.system, ctx) : undefined;
    const maxTokens = config.max_tokens || 4096;
    const { temperature, tools } = config;
    // Route to the batch backend matching the model's provider.
    const batchProvider = getProvider(model) === "openai" ? "openai" : "anthropic";
    const requestId = `wf_${randomUUID().replace(/-/g, "").slice(0, 16)}`;
    try {
        const result = await batchClient.processSingle(requestId, batchProvider, model, [{ role: "user", content: prompt }], { system, tools, max_tokens: maxTokens, temperature });
        if (!result.success)
            return { success: false, error: result.error || "Batch LLM request failed" };
        return {
            success: true,
            output: {
                response: result.text || "",
                content: result.content,
                usage: result.usage,
                batch_request_id: requestId,
            },
        };
    }
    catch (err) {
        return { success: false, error: sanitizeError(err) };
    }
}
342
+ // ============================================================================
343
+ // PHASE 2: APPROVAL STEP EXECUTOR
344
+ // ============================================================================
345
/**
 * Execute an approval step (human-in-the-loop gate).
 *
 * First pass: inserts a row into workflow_approval_requests, generates signed
 * guest-approval URLs, marks the step "waiting", and returns the sentinel
 * string "waiting" so the engine parks the run.
 * Second pass (engine resumed the step with a response in step.input): returns
 * success with approved/status/responder metadata and selects the
 * on_approve/on_reject branch from the step config.
 *
 * @param {object} supabase - Supabase client
 * @param {object} step - step-run record (run_id, step_run_id, store_id, workflow_id, step_config, input)
 * @param {object} ctx - template-resolution context
 * @returns {Promise<object|"waiting">}
 */
async function executeApprovalStep(supabase, step, ctx) {
    const config = step.step_config;
    // Second pass — step was resumed after approval response
    if (step.input && typeof step.input === "object" && step.input.approval_status) {
        const approvalData = step.input;
        const isApproved = approvalData.approval_status === "approved" || approvalData.approval_status === "approve";
        return {
            success: true,
            output: {
                approved: isApproved,
                status: approvalData.approval_status,
                response_data: approvalData.approval_data,
                responded_by: approvalData.responded_by,
            },
            branch: isApproved ? config.on_approve : config.on_reject,
        };
    }
    // First pass — create approval request and wait
    const title = resolveTemplate((config.title || "Approval Required"), ctx);
    const description = config.description ? resolveTemplate(config.description, ctx) : null;
    const prompt = config.prompt ? resolveTemplate(config.prompt, ctx) : null;
    const options = config.options || ["approve", "reject"];
    const timeoutSeconds = config.timeout_seconds || 86400;
    const timeoutAction = config.timeout_action || "fail";
    const channels = config.notification_channels || ["push"];
    const expiresAt = new Date(Date.now() + timeoutSeconds * 1000).toISOString();
    const { error: insertError } = await supabase.from("workflow_approval_requests").insert({
        store_id: step.store_id,
        run_id: step.run_id,
        step_run_id: step.step_run_id,
        workflow_id: step.workflow_id,
        title,
        description,
        prompt,
        options,
        form_schema: config.form_schema || null,
        assigned_to: config.assigned_to || null,
        assigned_role: config.assigned_role || null,
        expires_at: expiresAt,
        timeout_action: timeoutAction,
        notification_channels: channels,
    });
    // FIX: previously an insert failure was silently ignored and the step was
    // parked in "waiting" with NO approval-request row — nobody could ever
    // respond, so the run deadlocked until timeout enforcement. Fail fast.
    if (insertError)
        return { success: false, error: `Failed to create approval request: ${insertError.message}` };
    // Generate guest approval URLs (signed, no auth required) — only if signing secret is configured
    const guestUrls = {};
    const optionsList = Array.isArray(options) ? options : ["approve", "reject"];
    for (const opt of optionsList) {
        const url = generateGuestApprovalUrl(step.step_run_id, opt, expiresAt);
        if (url)
            guestUrls[opt] = url;
    }
    // Set step to waiting
    await supabase.from("workflow_step_runs").update({
        status: "waiting",
        output: { waiting_for: "approval", title, expires_at: expiresAt, guest_urls: guestUrls },
    }).eq("id", step.step_run_id);
    return "waiting";
}
402
+ // ============================================================================
403
+ // PHASE 7: ENHANCED CODE EXECUTION
404
+ // ============================================================================
405
+ // Re-export pool management for index.ts to initialize on startup
406
+ export { initWorkerPool, getPoolStats, shutdownPool };
407
+ // ============================================================================
408
+ // CRON EXPRESSION PARSER — 5-field (min hour dom mon dow)
409
+ // No external dependencies. Supports: *, */N, N-M, N,M, N
410
+ // ============================================================================
411
/**
 * Parse one cron field into a sorted array of matching integer values.
 * Supports: "*", "*\/N" steps, "N-M" ranges, "N,M" lists, bare "N".
 * Out-of-range range bounds are clamped to [min, max]; malformed parts
 * (NaN step/bounds, bare values outside the range) are silently skipped.
 *
 * @param {string} field - raw cron field text
 * @param {number} min - smallest legal value for this field
 * @param {number} max - largest legal value for this field
 * @returns {number[]} ascending list of matching values (may be empty)
 */
function parseCronField(field, min, max) {
    const values = new Set();
    // Clamp a range endpoint into the field's legal domain.
    const clamp = (n) => Math.min(max, Math.max(min, n));
    for (const part of field.split(",")) {
        const trimmed = part.trim();
        if (trimmed === "*") {
            for (let i = min; i <= max; i++)
                values.add(i);
        }
        else if (trimmed.includes("/")) {
            const [range, stepStr] = trimmed.split("/");
            const step = parseInt(stepStr, 10);
            if (isNaN(step) || step <= 0)
                continue;
            let start = min, end = max;
            if (range !== "*") {
                if (range.includes("-")) {
                    [start, end] = range.split("-").map(Number);
                }
                else {
                    start = parseInt(range, 10);
                }
            }
            // FIX: skip NaN bounds and clamp to [min, max] — previously e.g.
            // "58-70/2" in a minutes field emitted values > 59, which can never
            // match and forced getNextCronTime into a full 366-day scan.
            if (isNaN(start) || isNaN(end))
                continue;
            start = clamp(start);
            end = clamp(end);
            for (let i = start; i <= end; i += step)
                values.add(i);
        }
        else if (trimmed.includes("-")) {
            const [s, e] = trimmed.split("-").map(Number);
            // FIX: same clamping/NaN guard for plain ranges (e.g. "58-61").
            if (isNaN(s) || isNaN(e))
                continue;
            for (let i = clamp(s); i <= clamp(e); i++)
                values.add(i);
        }
        else {
            const n = parseInt(trimmed, 10);
            if (!isNaN(n) && n >= min && n <= max)
                values.add(n);
        }
    }
    return [...values].sort((a, b) => a - b);
}
449
+ /**
450
+ * Compute the next occurrence of a 5-field cron expression after `after`.
451
+ * Returns null if expression is invalid or no match found within 366 days.
452
+ */
453
/**
 * Get the UTC offset in minutes for an IANA timezone at a specific instant.
 * Extracts the wall-clock fields via Intl.DateTimeFormat.formatToParts and
 * diffs them against the real UTC timestamp — no locale-string parsing.
 *
 * @param {Date} date - the instant to evaluate
 * @param {string} tz - IANA timezone name (e.g. "America/New_York")
 * @returns {number} minutes the wall clock is ahead of UTC (negative = behind)
 */
function getUtcOffsetMinutes(date, tz) {
    const formatter = new Intl.DateTimeFormat("en-US", {
        timeZone: tz, year: "numeric", month: "numeric", day: "numeric",
        hour: "numeric", minute: "numeric", second: "numeric", hour12: false,
    });
    const fields = {};
    for (const { type, value } of formatter.formatToParts(date)) {
        fields[type] = parseInt(value, 10);
    }
    // Interpret the wall-clock fields as if they were UTC; the difference to
    // the real instant is exactly the zone offset. (hour % 24 guards "24:00".)
    const wallClockMs = Date.UTC(fields.year, fields.month - 1, fields.day, fields.hour % 24, fields.minute, fields.second);
    return (wallClockMs - date.getTime()) / 60_000;
}
470
/**
 * Compute the next occurrence of a 5-field cron expression (min hour dom mon dow)
 * strictly after `after`, optionally evaluated in an IANA timezone.
 *
 * @param {string} expression - 5-field cron expression
 * @param {Date} [after=new Date()] - instant to search forward from
 * @param {string} [timezone] - IANA timezone name; invalid or missing falls back to UTC
 * @returns {Date|null} next fire time as a real UTC instant, or null when the
 *   expression is malformed, any field parses to no values, or nothing matches
 *   within 366 days.
 */
export function getNextCronTime(expression, after = new Date(), timezone) {
    const parts = expression.trim().split(/\s+/);
    if (parts.length !== 5)
        return null;
    const minutes = parseCronField(parts[0], 0, 59);
    const hours = parseCronField(parts[1], 0, 23);
    const doms = parseCronField(parts[2], 1, 31);
    const months = parseCronField(parts[3], 1, 12);
    const dows = parseCronField(parts[4], 0, 6); // 0=Sunday
    // Any empty field means the expression can never match — bail out early.
    if (!minutes.length || !hours.length || !doms.length || !months.length || !dows.length)
        return null;
    // DOM/DOW: POSIX semantics — when both are restricted, match EITHER (OR).
    // When only one is restricted, match only that one.
    const domRestricted = parts[2] !== "*";
    const dowRestricted = parts[4] !== "*";
    const useDomDowOr = domRestricted && dowRestricted;
    // Validate timezone (constructing a formatter throws on unknown zone names).
    let effectiveTz = "UTC";
    if (timezone) {
        try {
            Intl.DateTimeFormat(undefined, { timeZone: timezone });
            effectiveTz = timezone;
        }
        catch { /* invalid tz, stay UTC */ }
    }
    // Strategy: work in "local time" coordinates using a fake Date whose UTC fields
    // represent the wall-clock time in the target timezone. This lets us use the fast
    // jump-by-month/day/hour logic. Once we find a match, we convert back to real UTC.
    //
    // The offset is recomputed each time we cross a day boundary to handle DST transitions.
    let offsetMin = effectiveTz === "UTC" ? 0 : getUtcOffsetMinutes(after, effectiveTz);
    const localEpoch = after.getTime() + offsetMin * 60_000;
    const candidate = new Date(localEpoch);
    candidate.setUTCSeconds(0, 0);
    candidate.setUTCMinutes(candidate.getUTCMinutes() + 1); // 1 minute after `after`
    const maxMs = after.getTime() + 366 * 86_400_000; // search horizon: 366 days
    let lastOffsetDay = candidate.getUTCDate(); // Track day for offset recomputation
    // Iterate in local-time coordinates (fast jumps — same algorithm as before)
    while (true) {
        // Recompute offset on day boundaries to handle DST transitions correctly
        const currentDay = candidate.getUTCDate();
        if (effectiveTz !== "UTC" && currentDay !== lastOffsetDay) {
            // Convert current candidate back to approximate UTC, then get fresh offset
            const approxUtc = new Date(candidate.getTime() - offsetMin * 60_000);
            const newOffset = getUtcOffsetMinutes(approxUtc, effectiveTz);
            if (newOffset !== offsetMin) {
                // DST changed — adjust candidate to maintain correct local-time coordinates
                const drift = (newOffset - offsetMin) * 60_000;
                candidate.setTime(candidate.getTime() + drift);
                offsetMin = newOffset;
            }
            lastOffsetDay = candidate.getUTCDate();
        }
        // Safety: check if we've exceeded 366 days
        const realUtc = candidate.getTime() - offsetMin * 60_000;
        if (realUtc > maxMs)
            return null;
        const mo = candidate.getUTCMonth() + 1;
        const day = candidate.getUTCDate();
        const hr = candidate.getUTCHours();
        const mi = candidate.getUTCMinutes();
        const dow = candidate.getUTCDay();
        // Coarse-to-fine jumps: month → day → hour → minute. Each mismatch skips
        // forward to the start of the next unit rather than scanning minute-by-minute.
        if (!months.includes(mo)) {
            candidate.setUTCMonth(candidate.getUTCMonth() + 1, 1);
            candidate.setUTCHours(0, 0, 0, 0);
            continue;
        }
        const domMatch = doms.includes(day);
        const dowMatch = dows.includes(dow);
        const dayMatch = useDomDowOr ? (domMatch || dowMatch) : (domMatch && dowMatch);
        if (!dayMatch) {
            candidate.setUTCDate(candidate.getUTCDate() + 1);
            candidate.setUTCHours(0, 0, 0, 0);
            continue;
        }
        if (!hours.includes(hr)) {
            candidate.setUTCHours(candidate.getUTCHours() + 1, 0, 0, 0);
            continue;
        }
        if (!minutes.includes(mi)) {
            candidate.setUTCMinutes(candidate.getUTCMinutes() + 1, 0, 0);
            continue;
        }
        // Match found in local coordinates — convert back to real UTC.
        if (effectiveTz === "UTC")
            return candidate;
        const result = new Date(candidate.getTime() - offsetMin * 60_000);
        // Verify: the result in UTC should map back to the same local time we matched.
        // DST spring-forward (gap): 2:00-3:00 doesn't exist → verifyOffset differs → we skip.
        // DST fall-back (ambiguity): 1:00-2:00 exists twice → first occurrence wins (same as cronie).
        const verifyOffset = getUtcOffsetMinutes(result, effectiveTz);
        if (verifyOffset !== offsetMin) {
            candidate.setUTCMinutes(candidate.getUTCMinutes() + 1, 0, 0);
            continue;
        }
        return result;
    }
}
568
+ // ============================================================================
569
+ // SCHEDULE TRIGGER PROCESSING — fires due cron workflows
570
+ // ============================================================================
571
/**
 * Fire all due scheduled workflows (both recurring cron and one-time runs).
 *
 * For each workflow whose next_run_at has passed: starts a run via the
 * start_workflow_run RPC (with an idempotency key for dedup), executes the run
 * inline, then either deactivates the workflow (one-time) or advances
 * next_run_at via getNextCronTime (recurring). next_run_at is updated even when
 * the start fails, so a broken workflow cannot retry on every sweep forever.
 *
 * @param {object} supabase - Supabase client
 * @returns {Promise<number>} number of runs actually started
 */
export async function processScheduleTriggers(supabase) {
    // Find workflows that are past due — supports both cron (recurring) and one-time (run_at)
    const { data: dueWorkflows } = await supabase.from("workflows")
        .select("id, store_id, cron_expression, timezone")
        .not("next_run_at", "is", null)
        .lte("next_run_at", new Date().toISOString())
        .eq("is_active", true)
        .eq("status", "active")
        .limit(10); // small batch per sweep; remaining due workflows picked up next tick
    if (!dueWorkflows?.length)
        return 0;
    let fired = 0;
    for (const wf of dueWorkflows) {
        try {
            // No cron expression means this was a one-time (run_at) schedule.
            const isOneTime = !wf.cron_expression;
            // Start the run
            const { data: result, error } = await supabase.rpc("start_workflow_run", {
                p_workflow_id: wf.id,
                p_store_id: wf.store_id,
                p_trigger_type: "schedule",
                p_trigger_payload: {
                    cron: wf.cron_expression || null,
                    one_time: isOneTime,
                    scheduled_at: new Date().toISOString(),
                },
                p_idempotency_key: wf.cron_expression
                    ? `schedule:${wf.id}:${new Date().toISOString().slice(0, 16)}` // Minute-granularity dedup for cron
                    : `schedule:${wf.id}:one_time`,
            });
            if (error || !result?.success) {
                log.error({ workflowId: wf.id, err: error?.message || result?.error }, "failed to start scheduled workflow");
                // Still update next_run_at to prevent infinite retries
            }
            else {
                fired++;
                // Generate trace_id for the new run
                const traceId = randomUUID();
                await supabase.from("workflow_runs").update({ trace_id: traceId }).eq("id", result.run_id);
                // Inline execution
                try {
                    await executeInlineChain(supabase, result.run_id);
                }
                catch (err) {
                    log.error({ runId: result.run_id, err: err.message }, "inline chain failed for scheduled run");
                }
            }
            if (isOneTime) {
                // One-time schedule: clear next_run_at and deactivate
                await supabase.from("workflows").update({
                    last_scheduled_at: new Date().toISOString(),
                    next_run_at: null,
                    is_active: false,
                    status: "paused",
                }).eq("id", wf.id);
            }
            else {
                // Recurring cron: compute next run time
                // NOTE: if getNextCronTime returns null (invalid expression), next_run_at
                // goes null and the workflow silently stops being scheduled.
                const nextRun = getNextCronTime(wf.cron_expression, new Date(), wf.timezone || undefined);
                await supabase.from("workflows").update({
                    last_scheduled_at: new Date().toISOString(),
                    next_run_at: nextRun?.toISOString() || null,
                }).eq("id", wf.id);
            }
        }
        catch (err) {
            log.error({ workflowId: wf.id, err: sanitizeError(err) }, "error processing scheduled workflow");
        }
    }
    return fired;
}
641
+ // ============================================================================
642
+ // WORKFLOW TIMEOUT ENFORCEMENT — cancel overtime runs
643
+ // ============================================================================
644
// Hard ceiling applied when a workflow has no (or a non-positive) configured limit.
const DEFAULT_MAX_RUN_DURATION_SEC = 3600; // 1 hour hard ceiling for any workflow run
/**
 * Enforce per-run duration limits and finalize zombie runs.
 *
 * Pass 1: any "running" run older than its workflow's max_run_duration_seconds
 * (or the 1h default) is marked timed_out via completeWorkflowRun and archived
 * to the DLQ.
 * Pass 2: "running" runs older than 2 minutes with zero non-terminal steps are
 * finalized via checkWorkflowCompletion.
 *
 * NOTE(review): both queries fetch up to 50 "running" runs without filtering by
 * elapsed time in SQL, so with >50 concurrent runs some overdue ones may not be
 * examined until a later sweep.
 *
 * @param {object} supabase - Supabase client
 * @returns {Promise<number>} number of runs timed out or finalized
 */
export async function enforceWorkflowTimeouts(supabase) {
    // Find running workflows that exceeded their duration limit
    const { data: timedOut } = await supabase.from("workflow_runs")
        .select("id, workflow_id, store_id, started_at, workflows!inner(max_run_duration_seconds, name)")
        .eq("status", "running")
        .not("started_at", "is", null)
        .limit(50);
    if (!timedOut?.length)
        return 0;
    let count = 0;
    const now = Date.now();
    for (const run of timedOut) {
        const wf = run.workflows;
        // Use configured max_duration or fall back to the hard ceiling
        const maxDuration = (wf?.max_run_duration_seconds && wf.max_run_duration_seconds > 0)
            ? wf.max_run_duration_seconds
            : DEFAULT_MAX_RUN_DURATION_SEC;
        const elapsed = now - new Date(run.started_at).getTime();
        if (elapsed < maxDuration * 1000)
            continue;
        // This run has timed out
        await completeWorkflowRun(supabase, run.id, run.workflow_id, run.store_id, "timed_out", `Workflow exceeded max duration of ${maxDuration}s (ran for ${Math.round(elapsed / 1000)}s)`);
        // Archive to DLQ
        await archiveToDlq(supabase, run.id, run.workflow_id, run.store_id, wf?.name);
        count++;
        log.warn({ runId: run.id, elapsedSec: Math.round(elapsed / 1000), maxDuration }, "workflow run timed out");
    }
    // Detect zombie runs: "running" with no active steps (all steps are terminal)
    // (queried AFTER pass 1, so runs just timed out above are already excluded)
    const { data: zombieRuns } = await supabase.from("workflow_runs")
        .select("id, workflow_id, store_id, started_at, workflows!inner(name)")
        .eq("status", "running")
        .not("started_at", "is", null)
        .limit(50);
    if (zombieRuns?.length) {
        for (const run of zombieRuns) {
            const elapsed = now - new Date(run.started_at).getTime();
            if (elapsed < 120_000)
                continue; // Only check runs older than 2 min
            const { data: activeSteps } = await supabase.from("workflow_step_runs")
                .select("id").eq("run_id", run.id)
                .in("status", ["pending", "running", "retrying", "waiting"]).limit(1);
            if (!activeSteps?.length) {
                // No active steps but run is still "running" — finalize it
                await checkWorkflowCompletion(supabase, run.id, run.workflow_id);
                count++;
                log.warn({ runId: run.id, elapsedSec: Math.round(elapsed / 1000) }, "finalized zombie workflow run");
            }
        }
    }
    return count;
}
696
+ // ============================================================================
697
+ // ORPHANED STEP CLEANUP — cancel step_runs whose parent run is terminal
698
+ // ============================================================================
699
let lastOrphanCleanupAt = 0;
const ORPHAN_CLEANUP_INTERVAL_MS = 60_000; // Run at most once per minute
/**
 * Cancel workflow step_runs stuck in a non-terminal status ("pending",
 * "retrying", "waiting") after their parent run has already finished.
 * Throttled to at most one sweep per minute per process.
 *
 * @param {object} supabase - Supabase client
 * @returns {Promise<number>} number of step runs cancelled this sweep
 */
export async function cleanupOrphanedSteps(supabase) {
    if (Date.now() - lastOrphanCleanupAt < ORPHAN_CLEANUP_INTERVAL_MS)
        return 0;
    lastOrphanCleanupAt = Date.now();
    // Find step_runs in non-terminal status whose parent run IS terminal.
    // The !inner join + .in filter on the JOINED table ensures Postgres only returns
    // rows where the run status is already terminal — no in-app filtering needed.
    const { data: staleSteps } = await supabase.from("workflow_step_runs")
        .select("id, run_id, step_key, status, workflow_runs!workflow_step_runs_run_id_fkey!inner(status)")
        .in("status", ["pending", "retrying", "waiting"])
        .in("workflow_runs.status", ["success", "failed", "cancelled", "timed_out"])
        .limit(100);
    if (!staleSteps?.length)
        return 0;
    let cancelledCount = 0;
    for (const stepRun of staleSteps) {
        const parentStatus = stepRun.workflow_runs?.status;
        await supabase.from("workflow_step_runs").update({
            status: "cancelled",
            error_message: `Orphaned: parent run already ${parentStatus}`,
            completed_at: new Date().toISOString(),
        }).eq("id", stepRun.id);
        cancelledCount++;
    }
    if (cancelledCount > 0)
        log.info({ cleaned: cancelledCount }, "cleaned up orphaned step runs");
    return cancelledCount;
}
730
+ // ============================================================================
731
+ // DLQ RETRY MECHANISM — retry transient DLQ failures
732
+ // ============================================================================
733
// Per-process throttle + retry-policy knobs for DLQ sweeps.
let lastDlqRetryAt = 0;
const DLQ_RETRY_INTERVAL_MS = 60_000; // Run at most once per minute
const DLQ_RETRY_BATCH_SIZE = 5;
const DLQ_MAX_RETRY_ATTEMPTS = 3;
/**
 * Retry transient DLQ failures by restarting their workflows.
 *
 * Only entries whose error_message matches transient patterns (timeout /
 * network / ECONNREFUSED / fetch failed) with retry_count below the cap are
 * eligible. Each entry observes per-entry exponential backoff (2/4/8 min).
 * On a successful restart the entry is marked "retried" and the new run is
 * executed inline; on failure the retry counter and last_error are updated
 * and the entry stays in the DLQ.
 *
 * @param {object} supabase - Supabase client
 * @returns {Promise<number>} number of DLQ entries successfully restarted
 */
export async function processDlqRetries(supabase) {
    // Throttle: only run once per minute
    if (Date.now() - lastDlqRetryAt < DLQ_RETRY_INTERVAL_MS)
        return 0;
    lastDlqRetryAt = Date.now();
    // Fetch retryable DLQ entries: transient errors (timeout, network), not yet exhausted
    const { data: entries } = await supabase.from("workflow_dlq")
        .select("id, workflow_id, store_id, trigger_type, trigger_payload, error_message, retry_count, last_retry_at")
        .or("error_message.ilike.%timed out%,error_message.ilike.%timeout%,error_message.ilike.%network%,error_message.ilike.%ECONNREFUSED%,error_message.ilike.%fetch failed%")
        .eq("status", "pending")
        .lt("retry_count", DLQ_MAX_RETRY_ATTEMPTS)
        .order("created_at", { ascending: true })
        .limit(DLQ_RETRY_BATCH_SIZE);
    if (!entries?.length)
        return 0;
    let retried = 0;
    const now = Date.now();
    for (const entry of entries) {
        try {
            // Enforce per-entry exponential backoff: 2min, 4min, 8min
            if (entry.last_retry_at) {
                const backoffMs = Math.pow(2, entry.retry_count || 0) * 120_000;
                const elapsed = now - new Date(entry.last_retry_at).getTime();
                if (elapsed < backoffMs)
                    continue; // Too soon for this entry
            }
            // Restart the workflow; idempotency key includes the attempt number so
            // each retry is a distinct (but non-duplicable) run.
            const { data: result, error } = await supabase.rpc("start_workflow_run", {
                p_workflow_id: entry.workflow_id,
                p_store_id: entry.store_id,
                p_trigger_type: entry.trigger_type || "dlq_retry",
                p_trigger_payload: { ...(entry.trigger_payload || {}), _dlq_retry: true, _dlq_entry_id: entry.id },
                p_idempotency_key: `dlq_retry:${entry.id}:${(entry.retry_count || 0) + 1}`,
            });
            if (error || !result?.success) {
                // Update retry count but keep in DLQ
                await supabase.from("workflow_dlq").update({
                    retry_count: (entry.retry_count || 0) + 1,
                    last_error: error?.message || result?.error || "retry failed",
                    last_retry_at: new Date().toISOString(),
                }).eq("id", entry.id);
                continue;
            }
            // Success — mark DLQ entry as retried
            await supabase.from("workflow_dlq").update({
                status: "retried",
                retry_count: (entry.retry_count || 0) + 1,
                retried_run_id: result.run_id,
                last_retry_at: new Date().toISOString(),
            }).eq("id", entry.id);
            // Execute inline
            if (result.run_id) {
                const traceId = randomUUID();
                await supabase.from("workflow_runs").update({ trace_id: traceId }).eq("id", result.run_id);
                try {
                    await executeInlineChain(supabase, result.run_id);
                }
                catch (err) {
                    log.error({ runId: result.run_id, err: err.message }, "inline chain failed for DLQ retry");
                }
            }
            retried++;
            log.info({ dlqEntryId: entry.id, workflowId: entry.workflow_id, retryCount: (entry.retry_count || 0) + 1 }, "DLQ entry retried");
        }
        catch (err) {
            log.warn({ dlqEntryId: entry.id, err: sanitizeError(err) }, "DLQ retry error");
            await supabase.from("workflow_dlq").update({
                retry_count: (entry.retry_count || 0) + 1,
                last_retry_at: new Date().toISOString(),
                last_error: sanitizeError(err),
            }).eq("id", entry.id);
        }
    }
    return retried;
}
811
+ // ============================================================================
812
+ // EVENT TRIGGER PROCESSING — match inbound events to workflow subscriptions
813
+ // ============================================================================
814
/**
 * Match inbound automation events to workflow event subscriptions and start
 * the corresponding runs.
 *
 * Claims up to 20 pending events (atomically via the claim_pending_events RPC,
 * or non-atomic SELECT+UPDATE fallback), batch-loads matching active
 * subscriptions, evaluates optional per-subscription filter expressions, starts
 * one deduplicated run per (event, subscription) pair, executes new runs
 * inline, then marks each event processed or failed.
 *
 * @param {object} supabase - Supabase client
 * @returns {Promise<number>} number of events marked processed
 */
export async function processEventTriggers(supabase) {
    // Atomically claim a batch of pending events using FOR UPDATE SKIP LOCKED
    // Falls back to SELECT+UPDATE if the RPC doesn't exist yet
    let events = null;
    const { data: claimed, error: claimErr } = await supabase.rpc("claim_pending_events", { batch_size: 20 });
    if (!claimErr && claimed?.length) {
        events = claimed;
    }
    else {
        // Fallback: non-atomic claim (SELECT then UPDATE)
        // NOTE: racy under concurrent workers — two processes could claim the
        // same events; run dedup via idempotency keys limits the damage.
        if (claimErr)
            log.debug({ err: claimErr.message }, "claim_pending_events RPC unavailable, using fallback");
        const { data: fallbackEvents } = await supabase.from("automation_events")
            .select("id, store_id, event_type, event_payload, source")
            .eq("status", "pending")
            .order("created_at", { ascending: true })
            .limit(20);
        if (fallbackEvents?.length) {
            const eventIds = fallbackEvents.map(e => e.id);
            await supabase.from("automation_events")
                .update({ status: "processing" })
                .in("id", eventIds);
            events = fallbackEvents;
        }
    }
    if (!events?.length)
        return 0;
    // P2 FIX: Batch-load ALL active subscriptions once instead of per-event queries.
    // Group by (store_id, event_type) for O(1) in-memory lookup per event.
    const uniqueStoreIds = Array.from(new Set(events.map(e => e.store_id)));
    const uniqueEventTypes = Array.from(new Set(events.map(e => e.event_type)));
    const { data: allSubs } = await supabase.from("workflow_event_subscriptions")
        .select("id, workflow_id, filter_expression, store_id, event_type")
        .in("store_id", uniqueStoreIds)
        .in("event_type", uniqueEventTypes)
        .eq("is_active", true);
    // Build lookup map: "store_id:event_type" -> subscriptions[]
    const subsMap = new Map();
    for (const sub of (allSubs || [])) {
        const key = `${sub.store_id}:${sub.event_type}`;
        if (!subsMap.has(key))
            subsMap.set(key, []);
        subsMap.get(key).push(sub);
    }
    let processed = 0;
    for (const event of events) {
        try {
            // P2 FIX: In-memory lookup instead of per-event DB query
            const subs = subsMap.get(`${event.store_id}:${event.event_type}`) || [];
            if (!subs.length) {
                // No subscribers — mark processed and move on
                await supabase.from("automation_events")
                    .update({ status: "processed", processed_at: new Date().toISOString() })
                    .eq("id", event.id);
                processed++;
                continue;
            }
            for (const sub of subs) {
                // Optional filter expression evaluation
                if (sub.filter_expression) {
                    try {
                        // Filters see the event payload as `trigger`; no step outputs yet.
                        const ctx = {
                            trigger: event.event_payload || {},
                            steps: {},
                        };
                        const pass = evaluateCondition(sub.filter_expression, ctx);
                        if (!pass)
                            continue; // Filter didn't match — skip this subscription
                    }
                    catch {
                        // Filter eval error — skip rather than block
                        continue;
                    }
                }
                // Start a workflow run for each matching subscription
                const idempotencyKey = `event:${event.id}:${sub.workflow_id}`;
                const { data: result, error: startErr } = await supabase.rpc("start_workflow_run", {
                    p_workflow_id: sub.workflow_id,
                    p_store_id: event.store_id,
                    p_trigger_type: "event",
                    p_trigger_payload: {
                        ...(event.event_payload || {}),
                        _event_id: event.id,
                        _event_type: event.event_type,
                        _event_source: event.source,
                    },
                    p_idempotency_key: idempotencyKey,
                });
                if (startErr || !result?.success) {
                    log.error({ workflowId: sub.workflow_id, eventId: event.id, err: startErr?.message || result?.error }, "failed to start event-triggered workflow");
                    continue;
                }
                // Assign trace ID and run inline chain for immediate execution
                if (result.run_id && !result.deduplicated) {
                    const traceId = randomUUID();
                    await supabase.from("workflow_runs").update({ trace_id: traceId }).eq("id", result.run_id);
                    try {
                        await executeInlineChain(supabase, result.run_id);
                    }
                    catch (err) {
                        log.error({ runId: result.run_id, err: err.message }, "inline chain failed for event run");
                    }
                }
            }
            // Mark event as processed
            await supabase.from("automation_events")
                .update({ status: "processed", processed_at: new Date().toISOString() })
                .eq("id", event.id);
            processed++;
        }
        catch (err) {
            // Mark event as failed
            await supabase.from("automation_events")
                .update({
                    status: "failed",
                    processed_at: new Date().toISOString(),
                    error_message: sanitizeError(err),
                })
                .eq("id", event.id);
            log.error({ eventId: event.id, err: sanitizeError(err) }, "error processing event trigger");
        }
    }
    return processed;
}
938
+ // ============================================================================
939
+ // DEAD LETTER QUEUE — archive failed runs for investigation
940
+ // ============================================================================
941
/**
 * Archive a failed or timed-out workflow run into the dead letter queue so it
 * can be inspected (and possibly retried) later. Silently no-ops when the run
 * row cannot be loaded; logs (but does not throw) on insert failure.
 *
 * @param {object} supabase - Supabase client
 * @param {string} runId - workflow_runs.id to archive
 * @param {string} workflowId - owning workflow ID
 * @param {string} storeId - owning store ID
 * @param {string} [workflowName] - display name, if known
 */
async function archiveToDlq(supabase, runId, workflowId, storeId, workflowName) {
    const { data: run } = await supabase.from("workflow_runs")
        .select("error_message, error_step_key, trigger_type, trigger_payload, step_outputs, duration_ms")
        .eq("id", runId).single();
    if (!run)
        return;
    const dlqRow = {
        store_id: storeId,
        run_id: runId,
        workflow_id: workflowId,
        workflow_name: workflowName || null,
        error_message: run.error_message,
        error_step_key: run.error_step_key,
        trigger_type: run.trigger_type,
        trigger_payload: run.trigger_payload || {},
        step_outputs: run.step_outputs || {},
        run_duration_ms: run.duration_ms,
    };
    const { error } = await supabase.from("workflow_dlq").insert(dlqRow);
    if (error)
        log.warn({ err: error.message, runId, workflowId }, "archiveToDlq insert failed");
}
962
/**
 * Process-isolated JS code execution via the persistent worker pool.
 * There is deliberately NO in-process fallback: a crash, OOM, or infinite loop
 * in user code cannot take down the server, and pool failure is a hard error.
 *
 * @param {object} config - step config: code, language, timeout_ms
 * @param {object} ctx - execution context ({ steps, trigger, input })
 * @returns {Promise<{success: boolean, output?: object, error?: string}>}
 */
async function executeCodeStepIsolated(config, ctx) {
    const { code } = config;
    if (!code)
        return { success: false, error: "No code in code step config" };
    const language = config.language || "javascript";
    if (language === "python") {
        // P0 FIX: Python regex sandbox is trivially bypassable (string concatenation,
        // __builtins__, globals() aliasing). Hard-fail until a real Python sandbox
        // (e.g. Pyodide/WASM or containerized execution) is available.
        return { success: false, error: "Python code execution is temporarily disabled — sandbox under hardening. Use JavaScript code steps instead." };
    }
    const timeoutMs = config.timeout_ms || CODE_TIMEOUT_MS;
    try {
        const poolResult = await executeWithPool({
            code,
            context: { steps: ctx.steps, trigger: ctx.trigger, input: ctx.input },
            timeoutMs,
        });
        // P3 FIX: Enforce same 100KB output cap as Python code steps
        if (poolResult.success && poolResult.output) {
            const serialized = JSON.stringify(poolResult.output);
            if (serialized.length > CODE_OUTPUT_MAX) {
                poolResult.output = { result: serialized.slice(0, CODE_OUTPUT_MAX) + "\n[output truncated at 100KB]", truncated: true };
            }
        }
        return poolResult;
    }
    catch (err) {
        // P0 FIX: Hard failure — no in-process fallback (SSRF risk via unrestricted fetch in sandbox)
        log.error({ err: err.message }, "worker pool execution failed, no fallback");
        return { success: false, error: "Code execution unavailable — worker pool failed" };
    }
}
1000
// P0 FIX: executeCodeStepInProcess removed — in-process fallback was a security risk
// (unrestricted fetch in sandbox = SSRF). All code steps now require the worker pool.
/**
 * Disabled Python code-step executor. Always rejects with a fixed error until
 * a real Python sandbox ships; both parameters are intentionally unused.
 */
async function executePythonCode(_code, _ctx) {
    const disabled = { success: false, error: "Python code execution is disabled — sandbox under hardening" };
    return disabled;
}
1005
// ============================================================================
// CIRCUIT BREAKER (per user_tool)
// ============================================================================
/**
 * Gate tool execution on the tool's circuit breaker.
 * Returns { allowed: true } when the tool row is missing or the breaker is not
 * open; when open and still cooling down, returns { allowed: false, reason }.
 * Once the cooldown elapses the breaker transitions to "half_open" (probe mode).
 */
async function checkToolCircuitBreaker(supabase, toolId) {
    const { data: breaker } = await supabase.from("user_tools")
        .select("circuit_breaker_state, circuit_breaker_tripped_at, circuit_breaker_cooldown_seconds")
        .eq("id", toolId).single();
    // No row or breaker not open — execution proceeds.
    if (!breaker || breaker.circuit_breaker_state !== "open") {
        return { allowed: true };
    }
    const cooldownEndsAt = new Date(breaker.circuit_breaker_tripped_at).getTime()
        + (breaker.circuit_breaker_cooldown_seconds || 300) * 1000;
    if (Date.now() < cooldownEndsAt) {
        return { allowed: false, reason: `Circuit breaker open (cooldown until ${new Date(cooldownEndsAt).toISOString()})` };
    }
    // Cooldown expired — allow one probe through in half-open state.
    await supabase.from("user_tools").update({ circuit_breaker_state: "half_open" }).eq("id", toolId);
    return { allowed: true };
}
1024
// P1 FIX: Transient error patterns — these indicate network/infra issues, not broken tools.
// Circuit breaker should only trip on persistent tool-level errors (auth, 404, bad config).
const TRANSIENT_ERROR_PATTERNS = [
    /timed?\s*out/i, /timeout/i, /ECONNREFUSED/i, /ECONNRESET/i, /ETIMEDOUT/i,
    /fetch failed/i, /network/i, /socket hang up/i, /EPIPE/i, /EHOSTUNREACH/i,
    /503 Service Unavailable/i, /502 Bad Gateway/i, /429 Too Many Requests/i,
];
/**
 * True when the error message matches a known transient (network/infra) pattern.
 * Empty/null messages are never considered transient.
 */
function isTransientError(errorMessage) {
    if (!errorMessage)
        return false;
    for (const pattern of TRANSIENT_ERROR_PATTERNS) {
        if (pattern.test(errorMessage))
            return true;
    }
    return false;
}
1036
/**
 * Record a tool invocation outcome against the tool's circuit breaker.
 * Success closes the breaker and resets the failure counter; persistent
 * failures increment it and, at the threshold, trip the breaker open and
 * write an audit event. Transient failures are ignored entirely.
 */
async function updateToolCircuitBreaker(supabase, toolId, success, errorMessage) {
    if (success) {
        // Any success fully closes the breaker and clears the failure count.
        await supabase.from("user_tools")
            .update({ circuit_breaker_state: "closed", circuit_breaker_failures: 0 })
            .eq("id", toolId);
        return;
    }
    // P1 FIX: transient (network/timeout/etc.) failures never count toward tripping.
    if (isTransientError(errorMessage)) {
        log.debug({ toolId, error: errorMessage }, "skipping circuit breaker increment for transient error");
        return;
    }
    const { data: counters } = await supabase.from("user_tools")
        .select("circuit_breaker_failures, circuit_breaker_threshold")
        .eq("id", toolId).single();
    if (!counters)
        return;
    const failures = (counters.circuit_breaker_failures || 0) + 1;
    if (failures < (counters.circuit_breaker_threshold || 5)) {
        // Below threshold — just persist the incremented counter.
        await supabase.from("user_tools").update({ circuit_breaker_failures: failures }).eq("id", toolId);
        return;
    }
    // Threshold reached — open the breaker and record an audit event.
    await supabase.from("user_tools").update({
        circuit_breaker_state: "open", circuit_breaker_failures: failures,
        circuit_breaker_tripped_at: new Date().toISOString(),
    }).eq("id", toolId);
    const { error: toolCbErr } = await supabase.from("audit_logs").insert({
        action: "workflow.circuit_breaker.tripped", severity: "warning",
        resource_type: "user_tool", resource_id: toolId, source: "workflow_engine",
        details: { failures, threshold: counters.circuit_breaker_threshold },
    });
    if (toolCbErr)
        log.warn({ err: toolCbErr.message, toolId }, "tool circuit breaker audit failed");
}
1068
/**
 * Record a workflow run outcome against the workflow-level circuit breaker.
 * Mirrors updateToolCircuitBreaker but operates on the workflows table:
 * success closes/resets, persistent failures increment and trip at threshold
 * (with an audit event), transient failures are ignored.
 */
async function handleWorkflowCircuitBreaker(supabase, workflowId, success, errorMessage) {
    if (success) {
        // A successful run closes the breaker and resets its failure counter.
        await supabase.from("workflows")
            .update({ circuit_breaker_state: "closed", circuit_breaker_failures: 0 })
            .eq("id", workflowId);
        return;
    }
    // P1 FIX: transient (network/timeout/etc.) failures never count toward tripping.
    if (isTransientError(errorMessage)) {
        log.debug({ workflowId, error: errorMessage }, "skipping workflow circuit breaker increment for transient error");
        return;
    }
    const { data: counters } = await supabase.from("workflows")
        .select("circuit_breaker_failures, circuit_breaker_threshold")
        .eq("id", workflowId).single();
    if (!counters)
        return;
    const failures = (counters.circuit_breaker_failures || 0) + 1;
    if (failures < (counters.circuit_breaker_threshold || 5)) {
        // Below threshold — just persist the incremented counter.
        await supabase.from("workflows").update({ circuit_breaker_failures: failures }).eq("id", workflowId);
        return;
    }
    // Threshold reached — open the breaker and record an audit event.
    await supabase.from("workflows").update({
        circuit_breaker_state: "open", circuit_breaker_failures: failures,
        circuit_breaker_tripped_at: new Date().toISOString(),
    }).eq("id", workflowId);
    const { error: wfCbErr } = await supabase.from("audit_logs").insert({
        action: "workflow.circuit_breaker.tripped", severity: "warning",
        resource_type: "workflow", resource_id: workflowId, source: "workflow_engine",
        details: { failures, threshold: counters.circuit_breaker_threshold },
    });
    if (wfCbErr)
        log.warn({ err: wfCbErr.message, workflowId }, "workflow circuit breaker audit failed");
}
1100
// ============================================================================
// CORE ENGINE
// ============================================================================
/**
 * Reclaim steps stuck in "running" status longer than their timeout + buffer.
 * This handles steps that crashed mid-execution (e.g., server restart, OOM).
 * Steps with retries remaining are reset to "retrying" so the worker can
 * re-execute them; steps with retries exhausted are marked "failed".
 * Returns the number of steps reclaimed.
 */
async function reclaimStaleSteps(supabase) {
    // P0 FIX: Use step.timeout_seconds * 2 (min 300s) as stale threshold to prevent
    // double-execution. The old 60s buffer was too aggressive for long-running steps.
    const MIN_STALE_THRESHOLD_MS = 300_000; // 5 min minimum before reclaim
    const MAX_STALE_AGE_MS = 1_200_000; // Hard ceiling: 20 min = always stale
    const { data: staleSteps } = await supabase.from("workflow_step_runs")
        .select(`
    id, step_key, run_id, attempt_count, max_attempts, started_at,
    workflow_steps!inner(timeout_seconds)
    `)
        .eq("status", "running")
        .not("started_at", "is", null)
        .limit(50);
    if (!staleSteps?.length)
        return 0;
    const now = Date.now();
    let reclaimedCount = 0;
    for (const candidate of staleSteps) {
        const timeoutSec = candidate.workflow_steps?.timeout_seconds || 120;
        // P0 FIX: threshold = clamp(timeout * 2, 300s, 20min) — gives step enough time to complete
        const thresholdMs = Math.min(Math.max(MIN_STALE_THRESHOLD_MS, timeoutSec * 2000), MAX_STALE_AGE_MS);
        const elapsedMs = now - new Date(candidate.started_at).getTime();
        if (elapsedMs < thresholdMs)
            continue;
        const exhaustedRetries = candidate.attempt_count >= candidate.max_attempts;
        // Build the status patch: terminal failure when retries are gone,
        // otherwise requeue for the worker with a short delay.
        const patch = exhaustedRetries
            ? {
                status: "failed",
                error_message: `Step stale: running for ${Math.round(elapsedMs / 1000)}s with no response (retries exhausted)`,
                completed_at: new Date().toISOString(),
            }
            : {
                status: "retrying",
                error_message: `Step stale: running for ${Math.round(elapsedMs / 1000)}s with no response (auto-reclaimed)`,
                next_retry_at: new Date(now + 5000).toISOString(),
            };
        await supabase.from("workflow_step_runs").update(patch).eq("id", candidate.id);
        reclaimedCount++;
        log.warn({ stepRunId: candidate.id, stepKey: candidate.step_key, runId: candidate.run_id, elapsedSec: Math.round(elapsedMs / 1000), exhaustedRetries }, "reclaimed stale step");
    }
    return reclaimedCount;
}
1154
/**
 * Claim and execute one batch of pending workflow steps.
 *
 * Order of operations: reclaim stale "running" steps, claim a batch via the
 * claim_pending_steps RPC, skip steps belonging to workflows whose circuit
 * breaker is open (finalizing their runs), then execute the rest — non-email
 * steps in parallel, email for_each children sequentially with a throttle.
 *
 * @param {object} supabase - supabase client
 * @param {number} [batchSize=10] - max steps to claim per call
 * @returns {Promise<{processed: number, errors: number, reclaimed: number}>}
 *
 * FIX: the final return previously omitted `reclaimed` (every early-return
 * path included it) and did not count circuit-breaker-skipped steps in
 * `processed` (the all-blocked early return did) — both made the result shape
 * inconsistent across paths. Now every path reports the same fields.
 */
export async function processWorkflowSteps(supabase, batchSize = 10) {
    // Reclaim stale steps first so they become available for claiming
    const reclaimed = await reclaimStaleSteps(supabase).catch(e => {
        log.warn({ err: e.message }, "reclaimStaleSteps failed");
        return 0;
    });
    const { data: claimedRaw, error: claimErr } = await supabase.rpc("claim_pending_steps", {
        batch_size: batchSize,
    });
    if (claimErr) {
        log.error({ err: claimErr.message }, "workflow claim error");
        return { processed: 0, errors: 1, reclaimed };
    }
    let claimed = Array.isArray(claimedRaw) ? claimedRaw : [];
    if (claimed.length === 0)
        return { processed: 0, errors: 0, reclaimed };
    // Circuit breaker enforcement — skip steps from workflows with open breakers
    const workflowIds = [...new Set(claimed.map(s => s.workflow_id))];
    const { data: openBreakers } = await supabase.from("workflows")
        .select("id")
        .in("id", workflowIds)
        .eq("circuit_breaker_state", "open");
    // Tracked at function scope so the final return can report skipped steps too.
    let skippedCount = 0;
    if (openBreakers?.length) {
        const blockedIds = new Set(openBreakers.map(w => w.id));
        const blocked = claimed.filter(s => blockedIds.has(s.workflow_id));
        claimed = claimed.filter(s => !blockedIds.has(s.workflow_id));
        skippedCount = blocked.length;
        // Mark blocked steps as skipped and finalize affected runs
        const affectedRuns = new Map();
        for (const step of blocked) {
            await supabase.from("workflow_step_runs").update({
                status: "skipped",
                error_message: "Workflow circuit breaker is open — step skipped",
                completed_at: new Date().toISOString(),
            }).eq("id", step.step_run_id);
            affectedRuns.set(step.run_id, step.workflow_id);
        }
        // Check completion for affected runs — prevents zombie "running" state
        for (const [runId, workflowId] of affectedRuns) {
            await checkWorkflowCompletion(supabase, runId, workflowId).catch(e => log.warn({ runId, err: e.message }, "completion check after circuit breaker skip failed"));
        }
        if (blocked.length)
            log.warn({ skippedSteps: blocked.length, blockedWorkflows: blockedIds.size }, "circuit breaker skipped steps");
        if (claimed.length === 0)
            return { processed: skippedCount, errors: 0, reclaimed };
    }
    log.info({ stepCount: claimed.length }, "processing workflow steps");
    let errors = 0;
    // Batch-fetch trace_ids for all runs in this batch
    const runIds = [...new Set(claimed.map(s => s.run_id))];
    const { data: runTraces } = await supabase.from("workflow_runs")
        .select("id, trace_id").in("id", runIds);
    const traceMap = new Map((runTraces || []).map(r => [r.id, r.trace_id]));
    // Partition: email tool steps from for_each need sequential processing with delays
    // to avoid overwhelming Resend's 2 req/s rate limit
    const isForEachEmailStep = (s) => s.parent_step_run_id && s.step_type === "tool" &&
        String(s.step_config?.tool_name || "").includes("email");
    const emailForEachSteps = claimed.filter(isForEachEmailStep);
    const otherSteps = claimed.filter(s => !isForEachEmailStep(s));
    // Execute one step; on an uncaught error, fail the step run, surface the
    // error, and re-check run completion so the run cannot stay "running" forever.
    const processStep = async (step) => {
        try {
            await applyVersionOverrides(supabase, step);
            await executeAndAdvance(supabase, step, traceMap.get(step.run_id));
        }
        catch (err) {
            errors++;
            const errMsg = sanitizeError(err);
            log.error({ stepKey: step.step_key, runId: step.run_id, err: errMsg }, "step execution error");
            await supabase.from("workflow_step_runs").update({
                status: "failed", error_message: errMsg,
                completed_at: new Date().toISOString(), duration_ms: 0,
            }).eq("id", step.step_run_id);
            // Surface error to clients (SSE broadcast + structured error_details on step run)
            await surfaceStepError(supabase, step, errMsg);
            // Check if this failure finalizes the run (prevents zombie runs from uncaught errors)
            await checkWorkflowCompletion(supabase, step.run_id, step.workflow_id).catch(e => log.warn({ runId: step.run_id, err: e.message }, "completion check after step error failed"));
        }
    };
    // Process non-email steps in parallel (existing behavior)
    await Promise.all(otherSteps.map(processStep));
    // Process email for_each children sequentially with 550ms throttle
    for (let i = 0; i < emailForEachSteps.length; i++) {
        if (i > 0)
            await new Promise(r => setTimeout(r, 550));
        await processStep(emailForEachSteps[i]);
    }
    // FIX: include skipped steps and the reclaimed count on the final path.
    return { processed: claimed.length + skippedCount, errors, reclaimed };
}
1241
/**
 * Check waiting steps: sub_workflow children completed, parallel/for_each children done.
 * Called by the persistent worker loop alongside processWorkflowSteps.
 *
 * Three phases:
 *   0. Expire pending approvals via RPC (best-effort).
 *   1. Resolve "waiting" sub_workflow steps whose child run reached a terminal status.
 *   2. Resolve "waiting" parallel/for_each parents whose children are all done —
 *      via an aggregate RPC when available, otherwise an N+1 fallback.
 *
 * @param {object} supabase - supabase client
 * @returns {Promise<number>} count of waiting steps resolved this pass
 */
export async function processWaitingSteps(supabase) {
    let resolved = 0;
    // 0. Expire pending approvals (Phase 2)
    try {
        await supabase.rpc("expire_pending_approvals");
    }
    catch (err) {
        // Non-fatal — RPC may not exist yet if migration not applied
        log.warn({ err: sanitizeError(err) }, "expire_pending_approvals error");
    }
    // 1. Sub-workflow steps waiting for child runs to complete
    const { data: subWfSteps } = await supabase
        .from("workflow_step_runs")
        .select("id, run_id, step_key, child_run_id, step_type")
        .eq("status", "waiting")
        .eq("step_type", "sub_workflow")
        .not("child_run_id", "is", null)
        .limit(50);
    if (subWfSteps?.length) {
        const childRunIds = subWfSteps.map(s => s.child_run_id).filter(Boolean);
        // Only terminal child runs ("success"/"failed") resolve their parent step.
        const { data: childRuns } = await supabase
            .from("workflow_runs")
            .select("id, status, step_outputs, error_message")
            .in("id", childRunIds)
            .in("status", ["success", "failed"]);
        if (childRuns?.length) {
            const runMap = new Map(childRuns.map(r => [r.id, r]));
            for (const step of subWfSteps) {
                const childRun = runMap.get(step.child_run_id);
                // Child still running — leave the parent step waiting.
                if (!childRun)
                    continue;
                const success = childRun.status === "success";
                // Mirror the child run's terminal state onto the parent step run.
                await supabase.from("workflow_step_runs").update({
                    status: success ? "success" : "failed",
                    output: childRun.step_outputs,
                    error_message: success ? null : childRun.error_message,
                    completed_at: new Date().toISOString(),
                }).eq("id", step.id);
                // Accumulate output + advance
                await accumulateAndAdvance(supabase, step.id, step.run_id, step.step_key, success, childRun.step_outputs, childRun.error_message);
                resolved++;
            }
        }
    }
    // 2. P1 FIX: Use aggregate RPC to eliminate N+1 queries (was 4 queries per parent)
    const { data: aggregatedParents, error: aggErr } = await supabase.rpc("get_waiting_parents_with_children");
    if (aggErr) {
        // Fallback to old N+1 pattern if RPC doesn't exist yet
        log.debug({ err: aggErr.message }, "get_waiting_parents_with_children RPC unavailable, using fallback");
        const { data: waitingParents } = await supabase
            .from("workflow_step_runs")
            .select("id, run_id, step_key, step_type, output")
            .eq("status", "waiting")
            .in("step_type", ["parallel", "for_each"])
            .limit(50);
        if (waitingParents?.length) {
            for (const parent of waitingParents) {
                // Count total vs terminal children for this parent (2 head-only queries).
                const { count: totalChildren } = await supabase
                    .from("workflow_step_runs")
                    .select("id", { count: "exact", head: true })
                    .eq("parent_step_run_id", parent.id);
                const { count: doneChildren } = await supabase
                    .from("workflow_step_runs")
                    .select("id", { count: "exact", head: true })
                    .eq("parent_step_run_id", parent.id)
                    .in("status", ["success", "failed", "skipped", "cancelled"]);
                // Parent resolves only when every child has reached a terminal status.
                if (totalChildren && doneChildren && doneChildren >= totalChildren) {
                    const { data: childOutputs } = await supabase
                        .from("workflow_step_runs")
                        .select("step_key, output, status, error_message")
                        .eq("parent_step_run_id", parent.id)
                        .order("created_at", { ascending: true });
                    const outputs = (childOutputs || []).map(c => c.output);
                    const failedKids = (childOutputs || []).filter(c => c.status === "failed");
                    // Any failed child fails the parent; first child error is surfaced.
                    const allSuccess = failedKids.length === 0;
                    await supabase.from("workflow_step_runs").update({
                        status: allSuccess ? "success" : "failed",
                        output: { children: outputs, total: totalChildren, failed: failedKids.length },
                        error_message: allSuccess ? null : failedKids[0]?.error_message,
                        completed_at: new Date().toISOString(),
                    }).eq("id", parent.id);
                    await accumulateAndAdvance(supabase, parent.id, parent.run_id, parent.step_key, allSuccess, { children: outputs }, allSuccess ? null : failedKids[0]?.error_message);
                    resolved++;
                }
            }
        }
    }
    else if (aggregatedParents?.length) {
        // RPC path: one row per waiting parent with pre-aggregated child counts/outputs.
        for (const parent of aggregatedParents) {
            if (parent.total_children > 0 && parent.done_children >= parent.total_children) {
                const childOutputsArr = (parent.child_outputs || []);
                const outputs = childOutputsArr.map((c) => c.output);
                const allSuccess = parent.failed_children === 0;
                const firstError = childOutputsArr.find((c) => c.status === "failed")?.error_message;
                await supabase.from("workflow_step_runs").update({
                    status: allSuccess ? "success" : "failed",
                    output: { children: outputs, total: parent.total_children, failed: parent.failed_children },
                    error_message: allSuccess ? null : firstError,
                    completed_at: new Date().toISOString(),
                }).eq("id", parent.parent_id);
                await accumulateAndAdvance(supabase, parent.parent_id, parent.parent_run_id, parent.parent_step_key, allSuccess, { children: outputs }, allSuccess ? null : firstError);
                resolved++;
            }
        }
    }
    return resolved;
}
1352
/**
 * Helper: after a waiting step resolves, accumulate its output into the run
 * and advance the workflow. Routing (on_success/on_failure) is read from the
 * pinned version snapshot when one exists, else from the live workflow_steps
 * table. When neither routes onward: a success triggers a completion check,
 * a failure finalizes the run as "failed".
 */
async function accumulateAndAdvance(supabase, stepRunId, runId, stepKey, success, output, errorMessage) {
    // Load run to get current step_outputs + workflow_id + store_id.
    const { data: run } = await supabase.from("workflow_runs")
        .select("workflow_id, step_outputs, store_id").eq("id", runId).single();
    if (!run)
        return;
    const { data: stepRow } = await supabase.from("workflow_step_runs")
        .select("step_id, step_key").eq("id", stepRunId).single();
    if (!stepRow)
        return;
    // Phase 4: Try versioned step def first, fall back to live table.
    let routing = null;
    const pinnedSteps = await loadVersionedSteps(supabase, runId);
    if (pinnedSteps) {
        const pinned = pinnedSteps.find((s) => s.step_key === stepKey);
        if (pinned)
            routing = { on_success: pinned.on_success, on_failure: pinned.on_failure, max_retries: pinned.max_retries };
    }
    if (!routing) {
        const { data: liveDef } = await supabase.from("workflow_steps")
            .select("on_success, on_failure, max_retries").eq("id", stepRow.step_id).single();
        routing = liveDef;
    }
    // P1 FIX: Atomic step output accumulation — use jsonb_set instead of read-modify-write.
    // This prevents lost updates when multiple steps complete concurrently.
    const stepOutput = { output, status: success ? "success" : "failed" };
    const { error: rpcErr } = await supabase.rpc("accumulate_step_output", {
        p_run_id: runId,
        p_step_key: stepKey,
        p_step_output: stepOutput,
    });
    if (rpcErr) {
        // Fallback to direct update if RPC doesn't exist yet (migration pending).
        const merged = { ...(run.step_outputs || {}), [stepKey]: stepOutput };
        await supabase.from("workflow_runs").update({ step_outputs: merged }).eq("id", runId);
    }
    // Route to the configured next step, or finalize the run.
    const nextKey = success ? routing?.on_success : routing?.on_failure;
    if (nextKey) {
        await createNextStepRunByKey(supabase, runId, run.workflow_id, nextKey);
    }
    else if (success) {
        await checkWorkflowCompletion(supabase, runId, run.workflow_id);
    }
    else {
        await completeWorkflowRun(supabase, runId, run.workflow_id, run.store_id, "failed", errorMessage, stepKey);
    }
}
1407
+ export async function executeAndAdvance(supabase, step, traceId) {
1408
+ const startTime = Date.now();
1409
+ // Phase 3.3: OTEL span for step execution
1410
+ const span = startSpan("workflow.step.execute", {
1411
+ "workflow.run_id": step.run_id,
1412
+ "workflow.step_key": step.step_key,
1413
+ "workflow.step_type": step.step_type,
1414
+ "workflow.attempt": step.attempt_count,
1415
+ ...(traceId ? { "workflow.trace_id": traceId } : {}),
1416
+ });
1417
+ // Event journal — step started
1418
+ await logWorkflowEvent(supabase, step.run_id, "step_started", {
1419
+ step_key: step.step_key, step_type: step.step_type, attempt: step.attempt_count,
1420
+ }, step.step_run_id);
1421
+ // Step result caching — skip successful steps on retry (idempotent re-execution)
1422
+ if (step.attempt_count > 1) {
1423
+ const { data: prevRun } = await supabase.from("workflow_step_runs")
1424
+ .select("status, output").eq("run_id", step.run_id).eq("step_key", step.step_key)
1425
+ .eq("status", "success").neq("id", step.step_run_id).limit(1);
1426
+ if (prevRun?.length) {
1427
+ log.info({ stepKey: step.step_key, runId: step.run_id }, "step already succeeded, using cached result");
1428
+ await supabase.from("workflow_step_runs").update({
1429
+ status: "success", output: prevRun[0].output,
1430
+ completed_at: new Date().toISOString(), duration_ms: Date.now() - startTime,
1431
+ }).eq("id", step.step_run_id);
1432
+ await logWorkflowEvent(supabase, step.run_id, "step_cached", { step_key: step.step_key }, step.step_run_id);
1433
+ const nextStepKey = step.on_success;
1434
+ if (!nextStepKey) {
1435
+ await checkWorkflowCompletion(supabase, step.run_id, step.workflow_id);
1436
+ }
1437
+ else {
1438
+ await createNextStepRunByKey(supabase, step.run_id, step.workflow_id, nextStepKey);
1439
+ }
1440
+ return;
1441
+ }
1442
+ }
1443
+ // Flow control — check concurrency/rate limits before execution
1444
+ const flowCheck = await checkFlowControl(supabase, step);
1445
+ if (!flowCheck.allowed) {
1446
+ // Requeue step with short delay for flow control backoff
1447
+ await supabase.from("workflow_step_runs").update({
1448
+ status: "retrying",
1449
+ next_retry_at: new Date(Date.now() + 2000).toISOString(), // retry in 2s
1450
+ output: { flow_control: flowCheck.reason },
1451
+ }).eq("id", step.step_run_id);
1452
+ await logWorkflowEvent(supabase, step.run_id, "step_throttled", { reason: flowCheck.reason }, step.step_run_id);
1453
+ return;
1454
+ }
1455
+ // Build template context
1456
+ const ctx = {
1457
+ steps: {},
1458
+ trigger: step.trigger_payload || {},
1459
+ input: step.input || undefined,
1460
+ workflow: {
1461
+ id: step.workflow_id,
1462
+ store_id: step.store_id,
1463
+ },
1464
+ run: {
1465
+ id: step.run_id,
1466
+ workflow_id: step.workflow_id,
1467
+ store_id: step.store_id,
1468
+ },
1469
+ };
1470
+ if (step.step_outputs && typeof step.step_outputs === "object") {
1471
+ for (const [key, val] of Object.entries(step.step_outputs)) {
1472
+ if (val && typeof val === "object") {
1473
+ ctx.steps[key] = val;
1474
+ }
1475
+ }
1476
+ }
1477
+ let result;
1478
+ // Enforce step-level timeout from step column (default 30s)
1479
+ const stepTimeoutSec = step.timeout_seconds || step.step_config.timeout_seconds || 30;
1480
+ const stepTimeoutMs = stepTimeoutSec * 1000;
1481
+ let stepTimer;
1482
+ const stepTimeoutPromise = new Promise((_, reject) => {
1483
+ stepTimer = setTimeout(() => reject(new Error(`Step timed out after ${stepTimeoutSec}s`)), stepTimeoutMs);
1484
+ });
1485
+ try {
1486
+ switch (step.step_type) {
1487
+ case "tool":
1488
+ result = await Promise.race([executeToolStep(supabase, step.step_config, ctx, step.store_id, traceId), stepTimeoutPromise]);
1489
+ break;
1490
+ case "condition":
1491
+ result = executeConditionStep(step.step_config, ctx);
1492
+ break;
1493
+ case "transform":
1494
+ result = executeTransformStep(step.step_config, ctx);
1495
+ break;
1496
+ case "delay": {
1497
+ // First attempt: set the delay. Second attempt (after delay): success.
1498
+ if (step.attempt_count <= 1) {
1499
+ const delaySec = step.step_config.seconds || 60;
1500
+ await supabase.from("workflow_step_runs").update({
1501
+ status: "retrying",
1502
+ output: { delay_seconds: delaySec, resume_at: new Date(Date.now() + delaySec * 1000).toISOString() },
1503
+ next_retry_at: new Date(Date.now() + delaySec * 1000).toISOString(),
1504
+ }).eq("id", step.step_run_id);
1505
+ clearTimeout(stepTimer);
1506
+ return; // Worker picks it up after delay
1507
+ }
1508
+ result = { success: true, output: { delayed: true, seconds: step.step_config.seconds } };
1509
+ break;
1510
+ }
1511
+ case "agent":
1512
+ result = await Promise.race([executeAgentStep(step.step_config, ctx, step.store_id, supabase, step, traceId), stepTimeoutPromise]);
1513
+ break;
1514
+ case "sub_workflow": {
1515
+ const childWfId = resolveTemplate((step.step_config.workflow_id || ""), ctx);
1516
+ if (!childWfId) {
1517
+ result = { success: false, error: "No workflow_id in sub_workflow config" };
1518
+ break;
1519
+ }
1520
+ const payloadTemplate = (step.step_config.trigger_payload_template || step.step_config.trigger_payload || {});
1521
+ const payload = resolveTemplate(payloadTemplate, ctx);
1522
+ const { data: startResult } = await supabase.rpc("start_workflow_run", {
1523
+ p_workflow_id: childWfId,
1524
+ p_store_id: step.store_id,
1525
+ p_trigger_type: "sub_workflow",
1526
+ p_trigger_payload: payload,
1527
+ });
1528
+ if (!startResult?.success) {
1529
+ result = { success: false, error: startResult?.error || "Failed to start sub-workflow" };
1530
+ break;
1531
+ }
1532
+ // Set to waiting — processWaitingSteps will resolve when child completes
1533
+ await supabase.from("workflow_step_runs").update({
1534
+ status: "waiting",
1535
+ child_run_id: startResult.run_id,
1536
+ output: { child_run_id: startResult.run_id, child_workflow_id: childWfId },
1537
+ }).eq("id", step.step_run_id);
1538
+ clearTimeout(stepTimer);
1539
+ return;
1540
+ }
1541
+ case "parallel": {
1542
+ const stepKeys = (step.step_config.step_keys || step.step_config.child_steps || []);
1543
+ if (stepKeys.length === 0) {
1544
+ result = { success: true, output: { parallel: true, steps: [] } };
1545
+ break;
1546
+ }
1547
+ if (stepKeys.length > MAX_PARALLEL_CHILDREN) {
1548
+ result = { success: false, error: `Parallel step has ${stepKeys.length} children, exceeding limit of ${MAX_PARALLEL_CHILDREN}` };
1549
+ break;
1550
+ }
1551
+ const { data: steps } = await supabase.from("workflow_steps")
1552
+ .select("id, step_key, step_type, max_retries")
1553
+ .eq("workflow_id", step.workflow_id).in("step_key", stepKeys);
1554
+ if (steps?.length) {
1555
+ await supabase.from("workflow_step_runs").insert(steps.map(s => ({
1556
+ run_id: step.run_id, step_id: s.id, step_key: s.step_key,
1557
+ step_type: s.step_type, status: "pending",
1558
+ max_attempts: s.max_retries ?? 3, parent_step_run_id: step.step_run_id,
1559
+ })));
1560
+ }
1561
+ await supabase.from("workflow_step_runs").update({
1562
+ status: "waiting", output: { waiting_for: stepKeys },
1563
+ }).eq("id", step.step_run_id);
1564
+ clearTimeout(stepTimer);
1565
+ return; // processWaitingSteps resolves when all children complete
1566
+ }
1567
+ case "for_each": {
1568
+ const itemsExpr = step.step_config.items;
1569
+ const targetStepKey = step.step_config.step_key;
1570
+ if (!itemsExpr || !targetStepKey) {
1571
+ result = { success: false, error: "for_each requires items and step_key in config" };
1572
+ break;
1573
+ }
1574
+ const items = resolveTemplate(itemsExpr, ctx);
1575
+ if (!Array.isArray(items)) {
1576
+ result = { success: false, error: `for_each items resolved to ${typeof items}, expected array` };
1577
+ break;
1578
+ }
1579
+ if (items.length === 0) {
1580
+ result = { success: true, output: { children: [], total: 0 } };
1581
+ break;
1582
+ }
1583
+ // P2 FIX: Enforce max items limit to prevent runaway step creation
1584
+ const maxItems = step.step_config.max_items || MAX_FOR_EACH_ITEMS;
1585
+ if (items.length > maxItems) {
1586
+ result = { success: false, error: `for_each exceeded maximum of ${maxItems} items (got ${items.length}). Increase limit in step config or paginate.` };
1587
+ break;
1588
+ }
1589
+ // Look up target step definition
1590
+ const { data: targetStep } = await supabase.from("workflow_steps")
1591
+ .select("id, step_key, step_type, max_retries")
1592
+ .eq("workflow_id", step.workflow_id).eq("step_key", targetStepKey).single();
1593
+ if (!targetStep) {
1594
+ result = { success: false, error: `for_each target step '${targetStepKey}' not found` };
1595
+ break;
1596
+ }
1597
+ // Create a step_run per item with the item as input
1598
+ await supabase.from("workflow_step_runs").insert(items.map((item, idx) => ({
1599
+ run_id: step.run_id, step_id: targetStep.id,
1600
+ step_key: `${targetStepKey}[${idx}]`, step_type: targetStep.step_type,
1601
+ status: "pending", max_attempts: targetStep.max_retries ?? 3,
1602
+ parent_step_run_id: step.step_run_id, input: item,
1603
+ })));
1604
+ await supabase.from("workflow_step_runs").update({
1605
+ status: "waiting", output: { waiting_for_count: items.length, target_step: targetStepKey },
1606
+ }).eq("id", step.step_run_id);
1607
+ clearTimeout(stepTimer);
1608
+ return;
1609
+ }
1610
+ case "code": {
1611
+ result = await Promise.race([executeCodeStepIsolated(step.step_config, ctx), stepTimeoutPromise]);
1612
+ break;
1613
+ }
1614
+ case "webhook_out":
1615
+ result = await Promise.race([executeWebhookOutStep(step.step_config, ctx), stepTimeoutPromise]);
1616
+ break;
1617
+ case "noop":
1618
+ result = executeNoopStep();
1619
+ break;
1620
+ case "llm_batch":
1621
+ result = await Promise.race([executeLlmBatchStep(step.step_config, ctx), stepTimeoutPromise]);
1622
+ break;
1623
+ case "approval": {
1624
+ const approvalResult = await executeApprovalStep(supabase, step, ctx);
1625
+ if (approvalResult === "waiting") {
1626
+ clearTimeout(stepTimer);
1627
+ return;
1628
+ }
1629
+ result = approvalResult;
1630
+ break;
1631
+ }
1632
+ // Custom step — POSTs workflow context to a user-defined URL and uses the response
1633
+ case "custom": {
1634
+ const customUrl = resolveTemplate((step.step_config.url || step.step_config.endpoint), ctx);
1635
+ if (!customUrl) {
1636
+ result = { success: false, error: "Custom step requires url in config" };
1637
+ break;
1638
+ }
1639
+ // P0 FIX: Use async validateUrl (DNS resolve-then-check) instead of sync isBlockedUrl
1640
+ const customSsrfError = await validateUrl(customUrl);
1641
+ if (customSsrfError) {
1642
+ result = { success: false, error: `Custom step blocked: ${customSsrfError}` };
1643
+ break;
1644
+ }
1645
+ try {
1646
+ const customHeaders = { "Content-Type": "application/json" };
1647
+ if (step.step_config.headers && typeof step.step_config.headers === "object") {
1648
+ for (const [k, v] of Object.entries(step.step_config.headers)) {
1649
+ customHeaders[k] = resolveTemplate(v, ctx);
1650
+ }
1651
+ }
1652
+ const customBody = JSON.stringify({
1653
+ step_key: step.step_key,
1654
+ run_id: step.run_id,
1655
+ workflow_id: step.workflow_id,
1656
+ input: step.input,
1657
+ step_outputs: step.step_outputs,
1658
+ trigger_payload: step.trigger_payload,
1659
+ config: step.step_config.payload_config || {},
1660
+ });
1661
+ const ctrl = new AbortController();
1662
+ const timer = setTimeout(() => ctrl.abort(), 30_000);
1663
+ const resp = await fetch(customUrl, { method: "POST", headers: customHeaders, body: customBody, signal: ctrl.signal });
1664
+ clearTimeout(timer);
1665
+ const respData = resp.headers.get("content-type")?.includes("json")
1666
+ ? await resp.json() : await resp.text();
1667
+ if (!resp.ok) {
1668
+ result = { success: false, error: `Custom step HTTP ${resp.status}: ${String(respData).substring(0, 500)}` };
1669
+ }
1670
+ else {
1671
+ // Support branch routing from custom step response
1672
+ const branch = typeof respData === "object" && respData?.branch ? respData.branch : undefined;
1673
+ result = { success: true, output: respData, branch };
1674
+ }
1675
+ }
1676
+ catch (err) {
1677
+ result = { success: false, error: err.name === "AbortError" ? "Custom step timed out" : sanitizeError(err) };
1678
+ }
1679
+ break;
1680
+ }
1681
+ // Waitpoint — generalized wait-for-external-signal (subsumes approval, webhook callback, cross-workflow)
1682
+ case "waitpoint": {
1683
+ // Second pass — resumed with completion data
1684
+ if (step.input && typeof step.input === "object" && step.input.waitpoint_completed) {
1685
+ result = { success: true, output: step.input.waitpoint_data || {} };
1686
+ break;
1687
+ }
1688
+ // First pass — create waitpoint token and pause
1689
+ const waitpointToken = randomUUID();
1690
+ const waitpointTimeout = step.step_config.timeout_seconds || 86400;
1691
+ const waitpointExpires = new Date(Date.now() + waitpointTimeout * 1000).toISOString();
1692
+ await supabase.from("waitpoint_tokens").insert({
1693
+ token: waitpointToken,
1694
+ run_id: step.run_id,
1695
+ step_run_id: step.step_run_id,
1696
+ store_id: step.store_id,
1697
+ expires_at: waitpointExpires,
1698
+ label: step.step_config.label || step.step_key,
1699
+ });
1700
+ await supabase.from("workflow_step_runs").update({
1701
+ status: "waiting",
1702
+ output: { waiting_for: "waitpoint", token: waitpointToken, expires_at: waitpointExpires },
1703
+ }).eq("id", step.step_run_id);
1704
+ await logWorkflowEvent(supabase, step.run_id, "waitpoint_created", { token: waitpointToken }, step.step_run_id);
1705
+ clearTimeout(stepTimer);
1706
+ return;
1707
+ }
1708
+ default:
1709
+ result = { success: false, error: `Unknown step type: ${step.step_type}` };
1710
+ }
1711
+ }
1712
+ catch (timeoutErr) {
1713
+ result = { success: false, error: timeoutErr.message || `Step timed out after ${stepTimeoutSec}s` };
1714
+ }
1715
+ finally {
1716
+ clearTimeout(stepTimer);
1717
+ }
1718
+ const durationMs = Date.now() - startTime;
1719
+ // Phase 3.3: End OTEL span with result attributes
1720
+ if (result.success) {
1721
+ span.end({ "workflow.duration_ms": durationMs, "workflow.status": "success" });
1722
+ }
1723
+ else {
1724
+ span.setError(result.error || "step failed");
1725
+ span.end({ "workflow.duration_ms": durationMs, "workflow.status": "failed" });
1726
+ }
1727
+ // Event journal — step completed
1728
+ await logWorkflowEvent(supabase, step.run_id, result.success ? "step_completed" : "step_failed", {
1729
+ step_key: step.step_key, duration_ms: durationMs,
1730
+ ...(result.error ? { error: result.error } : {}),
1731
+ ...(result.branch ? { branch: result.branch } : {}),
1732
+ }, step.step_run_id);
1733
+ // Persist step result
1734
+ await supabase.from("workflow_step_runs").update({
1735
+ status: result.success ? "success" : "failed",
1736
+ output: result.output || null,
1737
+ error_message: result.error || null,
1738
+ completed_at: new Date().toISOString(),
1739
+ duration_ms: durationMs,
1740
+ }).eq("id", step.step_run_id);
1741
+ // P4 FIX: Atomically merge step output using jsonb_set to prevent race conditions
1742
+ // Two concurrent steps can no longer overwrite each other's outputs
1743
+ const stepOutput = { output: result.output, status: result.success ? "success" : "failed", duration_ms: durationMs };
1744
+ const { error: rpcError } = await supabase.rpc("accumulate_step_output", {
1745
+ p_run_id: step.run_id,
1746
+ p_step_key: step.step_key,
1747
+ p_step_output: stepOutput,
1748
+ });
1749
+ if (rpcError) {
1750
+ // Retry once — transient connection errors are common under load
1751
+ log.warn({ err: rpcError.message, runId: step.run_id, stepKey: step.step_key }, "accumulate_step_output RPC error, retrying once");
1752
+ const { error: retryError } = await supabase.rpc("accumulate_step_output", {
1753
+ p_run_id: step.run_id,
1754
+ p_step_key: step.step_key,
1755
+ p_step_output: stepOutput,
1756
+ });
1757
+ if (retryError) {
1758
+ // Final fallback: advisory-lock-guarded RPC prevents parallel step race conditions
1759
+ log.warn({ err: retryError.message, runId: step.run_id, stepKey: step.step_key }, "accumulate_step_output retry failed, using locked fallback");
1760
+ const { error: lockError } = await supabase.rpc("accumulate_step_output_locked", {
1761
+ p_run_id: step.run_id,
1762
+ p_step_key: step.step_key,
1763
+ p_step_output: stepOutput,
1764
+ });
1765
+ if (lockError) {
1766
+ // P0 FIX: Throw so the step is marked as failed, not falsely as success
1767
+ throw new Error(`Failed to accumulate step output after all fallbacks: ${lockError.message}`);
1768
+ }
1769
+ }
1770
+ }
1771
+ // Checkpoint — snapshot state after each step for replay/debugging
1772
+ if (result.success) {
1773
+ const { error: cpError } = await supabase.from("workflow_checkpoints").insert({
1774
+ run_id: step.run_id, step_run_id: step.step_run_id, step_key: step.step_key,
1775
+ step_outputs: { ...(step.step_outputs || {}), [step.step_key]: { output: result.output, status: "success", duration_ms: durationMs } },
1776
+ trigger_payload: step.trigger_payload,
1777
+ sequence_number: Object.keys(step.step_outputs || {}).length + 1,
1778
+ });
1779
+ if (cpError)
1780
+ log.warn({ err: cpError.message, runId: step.run_id, stepKey: step.step_key }, "checkpoint insert failed");
1781
+ }
1782
+ // Audit
1783
+ const { error: stepAuditErr } = await supabase.from("audit_logs").insert({
1784
+ action: `workflow.step.${result.success ? "completed" : "failed"}`,
1785
+ severity: result.success ? "info" : "error",
1786
+ store_id: step.store_id, resource_type: "workflow_step_run",
1787
+ resource_id: step.step_run_id, source: "workflow_engine", duration_ms: durationMs,
1788
+ request_id: traceId || null,
1789
+ details: { workflow_id: step.workflow_id, run_id: step.run_id, step_key: step.step_key, step_type: step.step_type, attempt: step.attempt_count },
1790
+ error_message: result.error || null,
1791
+ });
1792
+ if (stepAuditErr)
1793
+ log.warn({ err: stepAuditErr.message, runId: step.run_id }, "step audit insert failed");
1794
+ // Surface step errors to clients via SSE broadcast + structured error_details
1795
+ if (!result.success && result.error) {
1796
+ await surfaceStepError(supabase, step, result.error);
1797
+ }
1798
+ // Child steps (parallel/for_each) — just save result, parent handles advancement
1799
+ if (step.parent_step_run_id) {
1800
+ // If child failed and has retries left, retry it (respects retry_policy)
1801
+ if (!result.success && step.attempt_count < step.max_attempts) {
1802
+ const retryPolicy = step.step_config.retry_policy;
1803
+ const backoffType = retryPolicy?.backoff_type || "exponential";
1804
+ const baseDelay = retryPolicy?.backoff_base_seconds || step.retry_delay_seconds || 10;
1805
+ const maxBackoff = retryPolicy?.max_backoff_seconds || 300;
1806
+ let backoffDelay;
1807
+ switch (backoffType) {
1808
+ case "fixed":
1809
+ backoffDelay = baseDelay;
1810
+ break;
1811
+ case "linear":
1812
+ backoffDelay = baseDelay * step.attempt_count;
1813
+ break;
1814
+ default:
1815
+ backoffDelay = baseDelay * Math.pow(2, step.attempt_count - 1);
1816
+ break;
1817
+ }
1818
+ backoffDelay = Math.min(backoffDelay, maxBackoff);
1819
+ // P3 FIX: Add jitter (50%-100% of computed delay) to prevent thundering herd
1820
+ backoffDelay *= (0.5 + Math.random() * 0.5);
1821
+ await supabase.from("workflow_step_runs").update({
1822
+ status: "retrying",
1823
+ next_retry_at: new Date(Date.now() + backoffDelay * 1000).toISOString(),
1824
+ }).eq("id", step.step_run_id);
1825
+ }
1826
+ return; // Parent's processWaitingSteps handles advancement
1827
+ }
1828
+ // Handle failure — configurable retry policy
1829
+ if (!result.success) {
1830
+ if (step.attempt_count < step.max_attempts) {
1831
+ // Check retry_on filter — only retry if error matches pattern (if configured)
1832
+ const retryPolicy = step.step_config.retry_policy;
1833
+ const retryOn = retryPolicy?.retry_on;
1834
+ const shouldRetry = !retryOn?.length || retryOn.some(pattern => result.error?.includes(pattern));
1835
+ if (shouldRetry) {
1836
+ const backoffType = retryPolicy?.backoff_type || "exponential";
1837
+ const baseDelay = retryPolicy?.backoff_base_seconds || step.retry_delay_seconds || 10;
1838
+ const maxBackoff = retryPolicy?.max_backoff_seconds || 300; // 5 min cap
1839
+ let backoffDelay;
1840
+ switch (backoffType) {
1841
+ case "fixed":
1842
+ backoffDelay = baseDelay;
1843
+ break;
1844
+ case "linear":
1845
+ backoffDelay = baseDelay * step.attempt_count;
1846
+ break;
1847
+ default:
1848
+ backoffDelay = baseDelay * Math.pow(2, step.attempt_count - 1);
1849
+ break; // exponential
1850
+ }
1851
+ backoffDelay = Math.min(backoffDelay, maxBackoff);
1852
+ // P3 FIX: Add jitter (50%-100% of computed delay) to prevent thundering herd
1853
+ backoffDelay *= (0.5 + Math.random() * 0.5);
1854
+ await supabase.from("workflow_step_runs").update({
1855
+ status: "retrying",
1856
+ next_retry_at: new Date(Date.now() + backoffDelay * 1000).toISOString(),
1857
+ }).eq("id", step.step_run_id);
1858
+ await logWorkflowEvent(supabase, step.run_id, "step_retrying", {
1859
+ step_key: step.step_key, attempt: step.attempt_count, backoff_type: backoffType, delay_seconds: backoffDelay,
1860
+ }, step.step_run_id);
1861
+ return;
1862
+ }
1863
+ // retry_on filter didn't match — fall through to failure handling
1864
+ }
1865
+ if (step.on_failure) {
1866
+ await createNextStepRunByKey(supabase, step.run_id, step.workflow_id, step.on_failure);
1867
+ }
1868
+ else {
1869
+ await completeWorkflowRun(supabase, step.run_id, step.workflow_id, step.store_id, "failed", result.error, step.step_key);
1870
+ }
1871
+ return;
1872
+ }
1873
+ // Advance — condition steps use branch, otherwise on_success
1874
+ const nextStepKey = result.branch || step.on_success;
1875
+ if (!nextStepKey) {
1876
+ await checkWorkflowCompletion(supabase, step.run_id, step.workflow_id);
1877
+ return;
1878
+ }
1879
+ await createNextStepRunByKey(supabase, step.run_id, step.workflow_id, nextStepKey);
1880
+ }
1881
+ // ============================================================================
1882
+ // WORKFLOW ADVANCEMENT HELPERS
1883
+ // ============================================================================
1884
// P1 FIX: Per-run in-memory cache for versioned steps (30s TTL)
// Prevents 4x redundant DB calls per step execution
// P0 FIX: Capped at 500 entries with LRU eviction to prevent unbounded memory growth
const VERSION_CACHE_MAX_SIZE = 500;
const versionedStepsCache = new Map();
const VERSION_CACHE_TTL_MS = 30_000;
/** Drop every cached version snapshot (used by tests / hot-reload paths). */
export function clearStepCache() {
    versionedStepsCache.clear();
}
/** Insert into cache with LRU eviction when size exceeds limit */
function versionedStepsCacheSet(key, value) {
    // Deleting before re-inserting bumps an existing key to the back of the
    // Map's insertion order, which is what makes the eviction below LRU-ish.
    versionedStepsCache.delete(key);
    versionedStepsCache.set(key, value);
    // Map iterates in insertion order, so the front of keys() holds the
    // least-recently-written entries — evict from there until under the cap.
    let overflow = versionedStepsCache.size - VERSION_CACHE_MAX_SIZE;
    if (overflow > 0) {
        for (const staleKey of versionedStepsCache.keys()) {
            if (overflow <= 0)
                break;
            versionedStepsCache.delete(staleKey);
            overflow--;
        }
    }
}
1912
// Periodic cleanup to prevent memory leaks — sweep expired entries every minute.
// FIX: unref() the timer (optional-chained: no-op outside Node) so this
// housekeeping interval does not hold the event loop open and block clean
// process exit / test-runner shutdown.
const versionCacheSweeper = setInterval(() => {
    const now = Date.now();
    for (const [key, entry] of versionedStepsCache) {
        if (now > entry.expiresAt)
            versionedStepsCache.delete(key);
    }
}, 60_000);
versionCacheSweeper.unref?.();
1920
/**
 * Load the versioned steps array for a run. Returns null if no version.
 * Uses per-run in-memory cache with 30s TTL (negative results are cached too).
 */
async function loadVersionedSteps(supabase, runId) {
    const hit = versionedStepsCache.get(runId);
    if (hit && Date.now() < hit.expiresAt) {
        return hit.data;
    }
    // Store a result (possibly null) under the run id with a fresh TTL.
    const remember = (data) => {
        versionedStepsCacheSet(runId, { data, expiresAt: Date.now() + VERSION_CACHE_TTL_MS });
        return data;
    };
    const { data: run } = await supabase.from("workflow_runs")
        .select("version_id").eq("id", runId).single();
    if (!run?.version_id) {
        // Run is not pinned to a version — cache the negative answer as well.
        return remember(null);
    }
    const { data: version } = await supabase.from("workflow_versions")
        .select("steps").eq("id", run.version_id).single();
    const steps = Array.isArray(version?.steps) ? version.steps : null;
    return remember(steps);
}
1940
/**
 * Apply versioned overrides to a claimed step. If the run has a version_id,
 * replaces step_config, on_success, on_failure with values from the snapshot.
 * Mutates `step` in place; no-op when there is no snapshot or no matching key.
 */
async function applyVersionOverrides(supabase, step) {
    const snapshot = await loadVersionedSteps(supabase, step.run_id);
    if (!snapshot)
        return;
    const override = snapshot.find((s) => s.step_key === step.step_key);
    if (!override)
        return;
    // step_config falls back on any falsy override value; the routing keys
    // only fall back when the override is null/undefined (?? semantics).
    step.step_config = override.step_config || step.step_config;
    step.on_success = override.on_success ?? step.on_success;
    step.on_failure = override.on_failure ?? step.on_failure;
}
1955
/**
 * Resolve a step definition by key. If the run has a version_id, load from
 * the versioned snapshot. Otherwise, load from the live workflow_steps table.
 * Returns null when the key does not exist in the snapshot.
 */
async function resolveStepDef(supabase, runId, workflowId, stepKey) {
    const snapshot = await loadVersionedSteps(supabase, runId);
    if (!snapshot) {
        // No version pinned — read the live definition.
        const { data } = await supabase.from("workflow_steps")
            .select("id, step_key, step_type, max_retries")
            .eq("workflow_id", workflowId).eq("step_key", stepKey).single();
        return data;
    }
    const found = snapshot.find((s) => s.step_key === stepKey);
    if (!found)
        return null;
    return {
        id: found.id,
        step_key: found.step_key,
        step_type: found.step_type,
        max_retries: found.max_retries ?? 3,
    };
}
1974
/**
 * Insert a new pending step run for `stepKey`, enforcing the workflow's
 * per-run step-count limit. Fails the run if the step does not exist or the
 * limit is exceeded. Returns the new step run id, or null on any failure.
 */
async function createNextStepRunByKey(supabase, runId, workflowId, stepKey) {
    const nextStep = await resolveStepDef(supabase, runId, workflowId, stepKey);
    if (!nextStep) {
        log.error({ stepKey, workflowId }, "step not found in workflow");
        const { data: run } = await supabase.from("workflow_runs").select("store_id").eq("id", runId).single();
        await completeWorkflowRun(supabase, runId, workflowId, run?.store_id, "failed", `Step '${stepKey}' not found`);
        return null;
    }
    // FIX: these three lookups are independent of each other — fetch them in
    // parallel instead of serially (was 3 sequential round-trips per step).
    const [{ data: run }, { count }, { data: wf }] = await Promise.all([
        supabase.from("workflow_runs").select("store_id").eq("id", runId).single(),
        supabase.from("workflow_step_runs").select("id", { count: "exact", head: true }).eq("run_id", runId),
        supabase.from("workflows").select("max_steps_per_run").eq("id", workflowId).single(),
    ]);
    // Check step count limit (default 50 when the workflow does not set one)
    const stepLimit = wf?.max_steps_per_run || 50;
    if ((count || 0) >= stepLimit) {
        await completeWorkflowRun(supabase, runId, workflowId, run?.store_id, "failed", `Step limit exceeded (${stepLimit})`);
        return null;
    }
    const { data: inserted } = await supabase.from("workflow_step_runs").insert({
        run_id: runId, step_id: nextStep.id, step_key: nextStep.step_key,
        step_type: nextStep.step_type, status: "pending", max_attempts: nextStep.max_retries ?? 3,
    }).select("id").single();
    return inserted?.id || null;
}
1999
+ // ============================================================================
2000
+ // PHASE 1: INLINE EXECUTION — execute steps immediately, no 5s wait
2001
+ // ============================================================================
2002
/**
 * Claim a single pending step for a specific run using atomic RPC.
 * P0 FIX: Uses claim_step_for_run RPC with FOR UPDATE SKIP LOCKED + attempt_count increment.
 * Replaces the old SELECT-then-UPDATE pattern that never incremented attempt_count (infinite retries).
 * Returns a normalized step object, or null when nothing is claimable.
 */
async function claimStepForRun(supabase, runId) {
    const { data, error } = await supabase.rpc("claim_step_for_run", { p_run_id: runId });
    if (error) {
        log.error({ err: error.message, runId }, "claim_step_for_run RPC failed");
        return null;
    }
    // RPC returns an array of rows; we expect 0 or 1
    const claimed = Array.isArray(data) ? data[0] : data;
    if (!claimed) {
        return null;
    }
    // Normalize nullable columns to the engine's defaults so downstream code
    // never has to null-check config/outputs/limits.
    return {
        step_run_id: claimed.step_run_id,
        run_id: claimed.run_id,
        workflow_id: claimed.workflow_id,
        store_id: claimed.store_id,
        step_id: claimed.step_id,
        step_key: claimed.step_key,
        step_type: claimed.step_type,
        step_config: claimed.step_config || {},
        on_success: claimed.on_success,
        on_failure: claimed.on_failure,
        timeout_seconds: claimed.timeout_seconds || 60,
        input_schema: claimed.input_schema,
        step_outputs: claimed.step_outputs || {},
        trigger_payload: claimed.trigger_payload || {},
        attempt_count: claimed.attempt_count || 1,
        max_attempts: claimed.max_attempts || 3,
        max_steps_per_run: claimed.max_steps_per_run || 50,
        input: claimed.input,
        parent_step_run_id: claimed.parent_step_run_id,
        retry_delay_seconds: claimed.retry_delay_seconds || 10,
    };
}
2040
/**
 * Execute the first pending step of a run inline, then chain subsequent steps.
 * Depth guard prevents unbounded recursion — worker loop catches anything left.
 */
export async function executeInlineChain(supabase, runId, depth = 0, traceId) {
    if (depth >= MAX_INLINE_DEPTH) {
        log.warn({ runId, depthLimit: MAX_INLINE_DEPTH }, "inline depth limit reached, deferring to worker");
        return;
    }
    // Resolve traceId from run record if not passed (first depth only to avoid repeated queries)
    if (depth === 0 && !traceId) {
        const { data: runRow } = await supabase.from("workflow_runs")
            .select("trace_id").eq("id", runId).single();
        traceId = runRow?.trace_id || undefined;
    }
    // H1 FIX: Claim step specifically for this run — cannot steal from other runs
    const claimed = await claimStepForRun(supabase, runId);
    if (!claimed) {
        return; // No pending steps for this run
    }
    // Phase 4: Override step_config/on_success/on_failure from version snapshot
    await applyVersionOverrides(supabase, claimed);
    try {
        await executeAndAdvance(supabase, claimed, traceId);
    }
    catch (err) {
        const message = sanitizeError(err);
        log.error({ stepKey: claimed.step_key, runId, err: message }, "inline step execution error");
        await supabase.from("workflow_step_runs").update({
            status: "failed", error_message: message,
            completed_at: new Date().toISOString(), duration_ms: 0,
        }).eq("id", claimed.step_run_id);
        // Surface error to clients (SSE broadcast + structured error_details on step run)
        await surfaceStepError(supabase, claimed, message);
        return;
    }
    // Steps that go async (delay, sub_workflow, parallel, for_each, approval) don't chain
    const asyncStepTypes = new Set(["delay", "sub_workflow", "parallel", "for_each", "approval", "waitpoint"]);
    if (asyncStepTypes.has(claimed.step_type)) {
        return;
    }
    // Chain to next step
    await executeInlineChain(supabase, runId, depth + 1, traceId);
}
2082
/**
 * Decide whether a run is finished and, if so, with what outcome.
 * A run completes only when every step is in a terminal state; failures with
 * a configured on_failure handler do not fail the run by themselves.
 */
async function checkWorkflowCompletion(supabase, runId, workflowId) {
    // P1 FIX: Single atomic query to count all step statuses — eliminates race windows
    // between multiple queries that could see inconsistent state
    const { data: stepRows } = await supabase.from("workflow_step_runs")
        .select("step_key, step_id, status, error_message")
        .eq("run_id", runId);
    const fetchStoreId = async () => {
        const { data: run } = await supabase.from("workflow_runs").select("store_id").eq("id", runId).single();
        return run?.store_id;
    };
    if (!stepRows?.length) {
        // No steps at all — mark as success (empty workflow)
        await completeWorkflowRun(supabase, runId, workflowId, await fetchStoreId(), "success");
        return;
    }
    // Count in-memory from single query result
    const stillActive = new Set(["pending", "running", "retrying", "waiting"]);
    if (stepRows.some((s) => stillActive.has(s.status))) {
        return; // Still in progress
    }
    // All steps are terminal — determine outcome
    const skipped = stepRows.filter((s) => s.status === "skipped");
    const failures = stepRows.filter((s) => s.status === "failed");
    // Filter to unhandled failures — steps where on_failure is null (no error handler)
    // M14 FIX: Use versioned step defs when available
    let unhandled = [];
    if (failures.length) {
        const versionedSteps = await loadVersionedSteps(supabase, runId);
        if (versionedSteps) {
            // Use versioned definitions
            const byKey = new Map(versionedSteps.map((s) => [s.step_key, s]));
            unhandled = failures.filter((s) => !byKey.get(s.step_key)?.on_failure);
        }
        else {
            // Fall back to live table
            const ids = failures.map((s) => s.step_id).filter(Boolean);
            if (ids.length) {
                const { data: defs } = await supabase.from("workflow_steps")
                    .select("id, on_failure").in("id", ids);
                const byId = new Map((defs || []).map((d) => [d.id, d]));
                unhandled = failures.filter((s) => !byId.get(s.step_id)?.on_failure);
            }
        }
    }
    const storeId = await fetchStoreId();
    if (unhandled.length) {
        await completeWorkflowRun(supabase, runId, workflowId, storeId, "failed", unhandled[0].error_message, unhandled[0].step_key);
    }
    else if (skipped.length) {
        // All remaining steps were skipped (circuit breaker) — fail the run
        await completeWorkflowRun(supabase, runId, workflowId, storeId, "failed", `${skipped.length} step(s) skipped by circuit breaker`, skipped[0].step_key);
    }
    else {
        await completeWorkflowRun(supabase, runId, workflowId, storeId, "success");
    }
}
2141
/**
 * Finalize a workflow run: write terminal status + aggregated error metadata,
 * cancel leftover steps, journal the event, feed the circuit breaker, audit,
 * and (on failure) notify + archive to the DLQ. Guarded against
 * double-completion via a conditional update on status = "running".
 */
export async function completeWorkflowRun(supabase, runId, workflowId, storeId, status, errorMessage, errorStepKey, traceId) {
    const { data: runRow } = await supabase.from("workflow_runs")
        .select("started_at, trace_id, metadata").eq("id", runId).single();
    const durationMs = runRow?.started_at ? Date.now() - new Date(runRow.started_at).getTime() : null;
    const resolvedTraceId = traceId || runRow?.trace_id || null;
    // Aggregate error_count and error_log from failed step runs
    const { data: failedSteps } = await supabase.from("workflow_step_runs")
        .select("step_key, step_type, error_message, error_details")
        .eq("run_id", runId)
        .eq("status", "failed");
    const errorLog = (failedSteps || []).map(({ step_key, step_type, error_message, error_details }) => ({
        step_name: step_key,
        step_type,
        error_message,
        ...(error_details ? { details: error_details } : {}),
    }));
    const mergedMetadata = {
        ...(runRow?.metadata || {}),
        error_count: errorLog.length,
        ...(errorLog.length ? { error_log: errorLog } : {}),
    };
    // Guard against double-completion: only update if still running
    const { data: updatedRows, error: updateErr } = await supabase.from("workflow_runs").update({
        status, error_message: errorMessage || null, error_step_key: errorStepKey || null,
        completed_at: new Date().toISOString(), duration_ms: durationMs, current_step_key: null,
        metadata: mergedMetadata,
    }).eq("id", runId).eq("status", "running").select("id");
    if (updateErr) {
        log.warn({ err: updateErr.message, runId, status }, "completeWorkflowRun update failed");
        return;
    }
    if (!updatedRows?.length) {
        log.warn({ runId, status }, "completeWorkflowRun skipped — run already completed by another caller");
        return;
    }
    // Cancel remaining pending steps
    await supabase.from("workflow_step_runs").update({ status: "cancelled" })
        .eq("run_id", runId).in("status", ["pending", "retrying", "waiting"]);
    // Event journal — run completed
    await logWorkflowEvent(supabase, runId, `run_${status}`, {
        workflow_id: workflowId, duration_ms: durationMs,
        ...(errorMessage ? { error: errorMessage } : {}),
    });
    // Circuit breaker
    if (workflowId) {
        await handleWorkflowCircuitBreaker(supabase, workflowId, status === "success", errorMessage);
    }
    // Audit
    const { error: auditErr } = await supabase.from("audit_logs").insert({
        action: `workflow.run.${status}`, severity: status === "success" ? "info" : "error",
        store_id: storeId || null, resource_type: "workflow_run", resource_id: runId,
        source: "workflow_engine", duration_ms: durationMs,
        request_id: resolvedTraceId,
        details: { workflow_id: workflowId, run_id: runId },
        error_message: errorMessage || null,
    });
    if (auditErr) {
        log.warn({ err: auditErr.message, runId }, "run completion audit failed");
    }
    // Error notifications
    if (status === "failed" && workflowId) {
        await sendErrorNotification(supabase, workflowId, runId, storeId, errorMessage, errorStepKey);
        // Auto-archive to Dead Letter Queue
        if (storeId) {
            try {
                await archiveToDlq(supabase, runId, workflowId, storeId);
            }
            catch (e) {
                log.warn({ err: e?.message, runId }, "archiveToDlq failed");
            }
        }
    }
}
2213
/**
 * Notify workflow owners of a failed run via the workflow's configured
 * on_error_webhook_url (SSRF-validated) and/or on_error_email.
 * Best-effort: notification failures are logged, never thrown.
 */
async function sendErrorNotification(supabase, workflowId, runId, storeId, errorMessage, errorStepKey) {
    const { data: wf } = await supabase.from("workflows")
        .select("name, on_error_webhook_url, on_error_email").eq("id", workflowId).single();
    if (!wf)
        return;
    const errorPayload = {
        event: "workflow.run.failed",
        workflow_id: workflowId, workflow_name: wf.name,
        run_id: runId, error_message: errorMessage, error_step: errorStepKey,
        timestamp: new Date().toISOString(),
    };
    // Webhook notification (with SSRF protection)
    const errorWebhookSsrf = wf.on_error_webhook_url ? await validateUrl(wf.on_error_webhook_url) : "no URL";
    if (wf.on_error_webhook_url && !errorWebhookSsrf) {
        const controller = new AbortController();
        const timer = setTimeout(() => controller.abort(), 10_000);
        try {
            await fetch(wf.on_error_webhook_url, {
                method: "POST", headers: { "Content-Type": "application/json" },
                body: JSON.stringify(errorPayload), signal: controller.signal,
            });
        }
        catch (err) {
            log.error({ err: sanitizeError(err), workflowId }, "error notification webhook failed");
        }
        finally {
            // FIX: always clear the abort timer — previously a throwing fetch
            // skipped clearTimeout, leaking the 10s timeout (stray abort and
            // the event loop kept alive until it fired).
            clearTimeout(timer);
        }
    }
    // Email notification
    if (wf.on_error_email && _executeTool && storeId) {
        try {
            await _executeTool(supabase, "email", {
                action: "send", to: wf.on_error_email,
                subject: `Workflow "${wf.name}" failed`,
                text: `Workflow "${wf.name}" failed at step "${errorStepKey || "unknown"}".\n\nError: ${errorMessage || "Unknown error"}\n\nRun ID: ${runId}\nTime: ${new Date().toISOString()}`,
            }, storeId);
        }
        catch (err) {
            log.error({ err: sanitizeError(err), workflowId }, "error notification email failed");
        }
    }
}
2254
+ // ============================================================================
2255
+ // WEBHOOK INGESTION
2256
+ // ============================================================================
2257
+ export async function handleWebhookIngestion(supabase, slug, rawBody, headers, storeId) {
2258
+ // P0 FIX: Multi-tenancy — scope webhook lookup by store_id when available
2259
+ let endpointQuery = supabase.from("webhook_endpoints")
2260
+ .select("*").eq("slug", slug).eq("is_active", true);
2261
+ if (storeId) {
2262
+ endpointQuery = endpointQuery.eq("store_id", storeId);
2263
+ }
2264
+ const { data: endpoints } = await endpointQuery.limit(1);
2265
+ const endpoint = endpoints?.[0];
2266
+ if (!endpoint)
2267
+ return { status: 404, body: { error: "Webhook endpoint not found" } };
2268
+ // Rate limit
2269
+ const oneMinAgo = new Date(Date.now() - 60_000).toISOString();
2270
+ const { count: recentCount } = await supabase.from("audit_logs")
2271
+ .select("id", { count: "exact", head: true })
2272
+ .eq("resource_type", "webhook_endpoint").eq("resource_id", endpoint.id)
2273
+ .gte("created_at", oneMinAgo);
2274
+ if ((recentCount || 0) >= endpoint.max_requests_per_minute) {
2275
+ return { status: 429, body: { error: "Rate limit exceeded" } };
2276
+ }
2277
+ // HMAC verification
2278
+ if (endpoint.verify_signature) {
2279
+ const signature = headers["x-webhook-signature"] || headers["x-hub-signature-256"] || "";
2280
+ if (!signature)
2281
+ return { status: 401, body: { error: "Missing signature" } };
2282
+ const expected = `sha256=${createHmac("sha256", endpoint.signing_secret).update(rawBody).digest("hex")}`;
2283
+ try {
2284
+ const sigBuf = Buffer.from(signature);
2285
+ const expBuf = Buffer.from(expected);
2286
+ if (sigBuf.length !== expBuf.length || !timingSafeEqual(sigBuf, expBuf)) {
2287
+ return { status: 401, body: { error: "Invalid signature" } };
2288
+ }
2289
+ }
2290
+ catch {
2291
+ return { status: 401, body: { error: "Invalid signature" } };
2292
+ }
2293
+ }
2294
+ // P3 FIX: Reject oversized payloads before parsing to prevent memory exhaustion
2295
+ const MAX_TRIGGER_PAYLOAD_BYTES = 10_000_000; // 10MB
2296
+ if (rawBody.length > MAX_TRIGGER_PAYLOAD_BYTES) {
2297
+ return { status: 413, body: { error: `Trigger payload too large (${rawBody.length} bytes, max 10MB)` } };
2298
+ }
2299
+ let payload;
2300
+ try {
2301
+ payload = JSON.parse(rawBody);
2302
+ }
2303
+ catch {
2304
+ payload = { raw: rawBody };
2305
+ }
2306
+ if (endpoint.payload_transform && typeof endpoint.payload_transform === "object") {
2307
+ payload = resolveTemplate(endpoint.payload_transform, { steps: {}, trigger: payload });
2308
+ }
2309
+ // Update stats
2310
+ await supabase.from("webhook_endpoints").update({
2311
+ last_received_at: new Date().toISOString(),
2312
+ total_received: (endpoint.total_received || 0) + 1,
2313
+ }).eq("id", endpoint.id);
2314
+ // Audit
2315
+ const { error: whAuditErr } = await supabase.from("audit_logs").insert({
2316
+ action: "webhook.received", severity: "info", store_id: endpoint.store_id,
2317
+ resource_type: "webhook_endpoint", resource_id: endpoint.id, source: "webhook",
2318
+ details: { slug, workflow_id: endpoint.workflow_id },
2319
+ });
2320
+ if (whAuditErr)
2321
+ log.warn({ err: whAuditErr.message, slug }, "webhook audit insert failed");
2322
+ // Start workflow
2323
+ const { data: startResult } = await supabase.rpc("start_workflow_run", {
2324
+ p_workflow_id: endpoint.workflow_id, p_store_id: endpoint.store_id,
2325
+ p_trigger_type: "webhook", p_trigger_payload: payload,
2326
+ });
2327
+ if (!startResult?.success) {
2328
+ return { status: 422, body: { error: startResult?.error || "Failed to start workflow" } };
2329
+ }
2330
+ const runId = startResult.run_id;
2331
+ // Sync response — poll until workflow completes or timeout
2332
+ if (endpoint.sync_response) {
2333
+ const timeoutMs = (endpoint.sync_timeout_seconds || 30) * 1000;
2334
+ const deadline = Date.now() + timeoutMs;
2335
+ while (Date.now() < deadline) {
2336
+ await new Promise(r => setTimeout(r, 500));
2337
+ const { data: run } = await supabase.from("workflow_runs")
2338
+ .select("status, step_outputs, error_message").eq("id", runId).single();
2339
+ if (run?.status === "success") {
2340
+ return { status: 200, body: { success: true, run_id: runId, output: run.step_outputs } };
2341
+ }
2342
+ if (run?.status === "failed") {
2343
+ return { status: 422, body: { success: false, run_id: runId, error: run.error_message, output: run.step_outputs } };
2344
+ }
2345
+ }
2346
+ return { status: 202, body: { success: true, run_id: runId, status: "running", message: "Workflow still in progress" } };
2347
+ }
2348
+ return { status: 200, body: { success: true, run_id: runId, deduplicated: startResult.deduplicated || false } };
2349
+ }