machinaos 0.0.1 → 0.0.6
This diff shows the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
- package/.env.template +71 -71
- package/LICENSE +21 -21
- package/README.md +145 -87
- package/bin/cli.js +62 -106
- package/client/.dockerignore +45 -45
- package/client/Dockerfile +68 -68
- package/client/dist/assets/index-DFSC53FP.css +1 -0
- package/client/dist/assets/index-fJ-1gTf5.js +613 -0
- package/client/dist/index.html +14 -0
- package/client/eslint.config.js +34 -16
- package/client/nginx.conf +66 -66
- package/client/package.json +61 -48
- package/client/src/App.tsx +27 -27
- package/client/src/Dashboard.tsx +1200 -1172
- package/client/src/ParameterPanel.tsx +302 -300
- package/client/src/components/AIAgentNode.tsx +315 -321
- package/client/src/components/APIKeyValidator.tsx +117 -117
- package/client/src/components/ClaudeChatModelNode.tsx +17 -17
- package/client/src/components/CredentialsModal.tsx +1200 -306
- package/client/src/components/GeminiChatModelNode.tsx +17 -17
- package/client/src/components/GenericNode.tsx +356 -356
- package/client/src/components/LocationParameterPanel.tsx +153 -153
- package/client/src/components/ModelNode.tsx +285 -285
- package/client/src/components/OpenAIChatModelNode.tsx +17 -17
- package/client/src/components/OutputPanel.tsx +470 -470
- package/client/src/components/ParameterRenderer.tsx +1873 -1873
- package/client/src/components/SkillEditorModal.tsx +3 -3
- package/client/src/components/SquareNode.tsx +812 -796
- package/client/src/components/ToolkitNode.tsx +365 -365
- package/client/src/components/auth/LoginPage.tsx +247 -247
- package/client/src/components/auth/ProtectedRoute.tsx +59 -59
- package/client/src/components/base/BaseChatModelNode.tsx +270 -270
- package/client/src/components/icons/AIProviderIcons.tsx +50 -50
- package/client/src/components/maps/GoogleMapsPicker.tsx +136 -136
- package/client/src/components/maps/MapsPreviewPanel.tsx +109 -109
- package/client/src/components/maps/index.ts +25 -25
- package/client/src/components/parameterPanel/InputSection.tsx +1094 -1094
- package/client/src/components/parameterPanel/LocationPanelLayout.tsx +64 -64
- package/client/src/components/parameterPanel/MapsSection.tsx +91 -91
- package/client/src/components/parameterPanel/MiddleSection.tsx +867 -571
- package/client/src/components/parameterPanel/OutputSection.tsx +80 -80
- package/client/src/components/parameterPanel/ParameterPanelLayout.tsx +81 -81
- package/client/src/components/parameterPanel/ToolSchemaEditor.tsx +436 -436
- package/client/src/components/parameterPanel/index.ts +41 -41
- package/client/src/components/shared/DataPanel.tsx +142 -142
- package/client/src/components/shared/JSONTreeRenderer.tsx +105 -105
- package/client/src/components/ui/AIResultModal.tsx +203 -203
- package/client/src/components/ui/ApiKeyInput.tsx +93 -0
- package/client/src/components/ui/CodeEditor.tsx +81 -81
- package/client/src/components/ui/CollapsibleSection.tsx +87 -87
- package/client/src/components/ui/ComponentItem.tsx +153 -153
- package/client/src/components/ui/ComponentPalette.tsx +320 -320
- package/client/src/components/ui/ConsolePanel.tsx +151 -43
- package/client/src/components/ui/ErrorBoundary.tsx +195 -195
- package/client/src/components/ui/InputNodesPanel.tsx +203 -203
- package/client/src/components/ui/MapSelector.tsx +313 -313
- package/client/src/components/ui/Modal.tsx +151 -148
- package/client/src/components/ui/NodeOutputPanel.tsx +1150 -1150
- package/client/src/components/ui/OutputDisplayPanel.tsx +381 -381
- package/client/src/components/ui/QRCodeDisplay.tsx +182 -0
- package/client/src/components/ui/TopToolbar.tsx +736 -736
- package/client/src/components/ui/WorkflowSidebar.tsx +293 -293
- package/client/src/config/antdTheme.ts +186 -186
- package/client/src/contexts/AuthContext.tsx +221 -221
- package/client/src/contexts/ThemeContext.tsx +42 -42
- package/client/src/contexts/WebSocketContext.tsx +2144 -1971
- package/client/src/factories/baseChatModelFactory.ts +255 -255
- package/client/src/hooks/useAndroidOperations.ts +118 -164
- package/client/src/hooks/useApiKeyValidation.ts +106 -106
- package/client/src/hooks/useApiKeys.ts +238 -238
- package/client/src/hooks/useAppTheme.ts +17 -17
- package/client/src/hooks/useComponentPalette.ts +50 -50
- package/client/src/hooks/useDragAndDrop.ts +123 -123
- package/client/src/hooks/useDragVariable.ts +88 -88
- package/client/src/hooks/useExecution.ts +319 -313
- package/client/src/hooks/useParameterPanel.ts +176 -176
- package/client/src/hooks/useReactFlowNodes.ts +188 -188
- package/client/src/hooks/useToolSchema.ts +209 -209
- package/client/src/hooks/useWhatsApp.ts +196 -196
- package/client/src/hooks/useWorkflowManagement.ts +45 -45
- package/client/src/index.css +314 -314
- package/client/src/nodeDefinitions/aiAgentNodes.ts +335 -335
- package/client/src/nodeDefinitions/aiModelNodes.ts +340 -340
- package/client/src/nodeDefinitions/androidServiceNodes.ts +383 -383
- package/client/src/nodeDefinitions/chatNodes.ts +135 -135
- package/client/src/nodeDefinitions/codeNodes.ts +54 -54
- package/client/src/nodeDefinitions/index.ts +14 -14
- package/client/src/nodeDefinitions/locationNodes.ts +462 -462
- package/client/src/nodeDefinitions/schedulerNodes.ts +220 -220
- package/client/src/nodeDefinitions/skillNodes.ts +17 -5
- package/client/src/nodeDefinitions/utilityNodes.ts +284 -284
- package/client/src/nodeDefinitions/whatsappNodes.ts +821 -865
- package/client/src/nodeDefinitions.ts +101 -103
- package/client/src/services/dynamicParameterService.ts +95 -95
- package/client/src/services/execution/aiAgentExecutionService.ts +34 -34
- package/client/src/services/executionService.ts +227 -231
- package/client/src/services/workflowApi.ts +91 -91
- package/client/src/store/useAppStore.ts +578 -581
- package/client/src/styles/theme.ts +513 -508
- package/client/src/styles/zIndex.ts +16 -16
- package/client/src/types/ComponentTypes.ts +38 -38
- package/client/src/types/INodeProperties.ts +287 -287
- package/client/src/types/NodeTypes.ts +27 -27
- package/client/src/utils/formatters.ts +32 -32
- package/client/src/utils/googleMapsLoader.ts +139 -139
- package/client/src/utils/locationUtils.ts +84 -84
- package/client/src/utils/nodeUtils.ts +30 -30
- package/client/src/utils/workflow.ts +29 -29
- package/client/src/vite-env.d.ts +12 -12
- package/client/tailwind.config.js +59 -59
- package/client/tsconfig.json +25 -25
- package/client/vite.config.js +35 -35
- package/package.json +78 -70
- package/scripts/build.js +153 -45
- package/scripts/clean.js +40 -40
- package/scripts/start.js +234 -210
- package/scripts/stop.js +301 -325
- package/server/.dockerignore +44 -44
- package/server/Dockerfile +45 -45
- package/server/constants.py +244 -249
- package/server/core/cache.py +460 -460
- package/server/core/config.py +127 -127
- package/server/core/container.py +98 -98
- package/server/core/database.py +1296 -1210
- package/server/core/logging.py +313 -313
- package/server/main.py +288 -288
- package/server/middleware/__init__.py +5 -5
- package/server/middleware/auth.py +89 -89
- package/server/models/auth.py +52 -52
- package/server/models/cache.py +24 -24
- package/server/models/database.py +235 -210
- package/server/models/nodes.py +435 -455
- package/server/pyproject.toml +75 -72
- package/server/requirements.txt +83 -83
- package/server/routers/android.py +294 -294
- package/server/routers/auth.py +203 -203
- package/server/routers/database.py +150 -150
- package/server/routers/maps.py +141 -141
- package/server/routers/nodejs_compat.py +288 -288
- package/server/routers/webhook.py +90 -90
- package/server/routers/websocket.py +2239 -2127
- package/server/routers/whatsapp.py +761 -761
- package/server/routers/workflow.py +199 -199
- package/server/services/ai.py +2444 -2414
- package/server/services/android_service.py +588 -588
- package/server/services/auth.py +130 -130
- package/server/services/chat_client.py +160 -160
- package/server/services/deployment/manager.py +706 -706
- package/server/services/event_waiter.py +675 -785
- package/server/services/execution/executor.py +1351 -1351
- package/server/services/execution/models.py +1 -1
- package/server/services/handlers/__init__.py +122 -126
- package/server/services/handlers/ai.py +390 -355
- package/server/services/handlers/android.py +69 -260
- package/server/services/handlers/code.py +278 -278
- package/server/services/handlers/http.py +193 -193
- package/server/services/handlers/tools.py +146 -32
- package/server/services/handlers/triggers.py +107 -107
- package/server/services/handlers/utility.py +822 -822
- package/server/services/handlers/whatsapp.py +423 -476
- package/server/services/maps.py +288 -288
- package/server/services/memory_store.py +103 -103
- package/server/services/node_executor.py +372 -375
- package/server/services/scheduler.py +155 -155
- package/server/services/skill_loader.py +1 -1
- package/server/services/status_broadcaster.py +834 -826
- package/server/services/temporal/__init__.py +23 -23
- package/server/services/temporal/activities.py +344 -344
- package/server/services/temporal/client.py +76 -76
- package/server/services/temporal/executor.py +147 -147
- package/server/services/temporal/worker.py +251 -251
- package/server/services/temporal/workflow.py +355 -355
- package/server/services/temporal/ws_client.py +236 -236
- package/server/services/text.py +110 -110
- package/server/services/user_auth.py +172 -172
- package/server/services/websocket_client.py +29 -29
- package/server/services/workflow.py +597 -597
- package/server/skills/android-skill/SKILL.md +4 -4
- package/server/skills/code-skill/SKILL.md +123 -89
- package/server/skills/maps-skill/SKILL.md +3 -3
- package/server/skills/memory-skill/SKILL.md +1 -1
- package/server/skills/web-search-skill/SKILL.md +154 -0
- package/server/skills/whatsapp-skill/SKILL.md +3 -3
- package/server/uv.lock +461 -100
- package/server/whatsapp-rpc/.dockerignore +30 -30
- package/server/whatsapp-rpc/Dockerfile +44 -44
- package/server/whatsapp-rpc/Dockerfile.web +17 -17
- package/server/whatsapp-rpc/README.md +139 -139
- package/server/whatsapp-rpc/bin/whatsapp-rpc-server +0 -0
- package/server/whatsapp-rpc/cli.js +95 -95
- package/server/whatsapp-rpc/configs/config.yaml +6 -6
- package/server/whatsapp-rpc/docker-compose.yml +35 -35
- package/server/whatsapp-rpc/docs/API.md +410 -410
- package/server/whatsapp-rpc/node_modules/.package-lock.json +259 -0
- package/server/whatsapp-rpc/node_modules/chalk/license +9 -0
- package/server/whatsapp-rpc/node_modules/chalk/package.json +83 -0
- package/server/whatsapp-rpc/node_modules/chalk/readme.md +297 -0
- package/server/whatsapp-rpc/node_modules/chalk/source/index.d.ts +325 -0
- package/server/whatsapp-rpc/node_modules/chalk/source/index.js +225 -0
- package/server/whatsapp-rpc/node_modules/chalk/source/utilities.js +33 -0
- package/server/whatsapp-rpc/node_modules/chalk/source/vendor/ansi-styles/index.d.ts +236 -0
- package/server/whatsapp-rpc/node_modules/chalk/source/vendor/ansi-styles/index.js +223 -0
- package/server/whatsapp-rpc/node_modules/chalk/source/vendor/supports-color/browser.d.ts +1 -0
- package/server/whatsapp-rpc/node_modules/chalk/source/vendor/supports-color/browser.js +34 -0
- package/server/whatsapp-rpc/node_modules/chalk/source/vendor/supports-color/index.d.ts +55 -0
- package/server/whatsapp-rpc/node_modules/chalk/source/vendor/supports-color/index.js +190 -0
- package/server/whatsapp-rpc/node_modules/commander/LICENSE +22 -0
- package/server/whatsapp-rpc/node_modules/commander/Readme.md +1148 -0
- package/server/whatsapp-rpc/node_modules/commander/esm.mjs +16 -0
- package/server/whatsapp-rpc/node_modules/commander/index.js +26 -0
- package/server/whatsapp-rpc/node_modules/commander/lib/argument.js +145 -0
- package/server/whatsapp-rpc/node_modules/commander/lib/command.js +2179 -0
- package/server/whatsapp-rpc/node_modules/commander/lib/error.js +43 -0
- package/server/whatsapp-rpc/node_modules/commander/lib/help.js +462 -0
- package/server/whatsapp-rpc/node_modules/commander/lib/option.js +329 -0
- package/server/whatsapp-rpc/node_modules/commander/lib/suggestSimilar.js +100 -0
- package/server/whatsapp-rpc/node_modules/commander/package-support.json +16 -0
- package/server/whatsapp-rpc/node_modules/commander/package.json +80 -0
- package/server/whatsapp-rpc/node_modules/commander/typings/esm.d.mts +3 -0
- package/server/whatsapp-rpc/node_modules/commander/typings/index.d.ts +884 -0
- package/server/whatsapp-rpc/node_modules/cross-spawn/LICENSE +21 -0
- package/server/whatsapp-rpc/node_modules/cross-spawn/README.md +89 -0
- package/server/whatsapp-rpc/node_modules/cross-spawn/index.js +39 -0
- package/server/whatsapp-rpc/node_modules/cross-spawn/lib/enoent.js +59 -0
- package/server/whatsapp-rpc/node_modules/cross-spawn/lib/parse.js +91 -0
- package/server/whatsapp-rpc/node_modules/cross-spawn/lib/util/escape.js +47 -0
- package/server/whatsapp-rpc/node_modules/cross-spawn/lib/util/readShebang.js +23 -0
- package/server/whatsapp-rpc/node_modules/cross-spawn/lib/util/resolveCommand.js +52 -0
- package/server/whatsapp-rpc/node_modules/cross-spawn/package.json +73 -0
- package/server/whatsapp-rpc/node_modules/execa/index.d.ts +955 -0
- package/server/whatsapp-rpc/node_modules/execa/index.js +309 -0
- package/server/whatsapp-rpc/node_modules/execa/lib/command.js +119 -0
- package/server/whatsapp-rpc/node_modules/execa/lib/error.js +87 -0
- package/server/whatsapp-rpc/node_modules/execa/lib/kill.js +102 -0
- package/server/whatsapp-rpc/node_modules/execa/lib/pipe.js +42 -0
- package/server/whatsapp-rpc/node_modules/execa/lib/promise.js +36 -0
- package/server/whatsapp-rpc/node_modules/execa/lib/stdio.js +49 -0
- package/server/whatsapp-rpc/node_modules/execa/lib/stream.js +133 -0
- package/server/whatsapp-rpc/node_modules/execa/lib/verbose.js +19 -0
- package/server/whatsapp-rpc/node_modules/execa/license +9 -0
- package/server/whatsapp-rpc/node_modules/execa/package.json +90 -0
- package/server/whatsapp-rpc/node_modules/execa/readme.md +822 -0
- package/server/whatsapp-rpc/node_modules/get-stream/license +9 -0
- package/server/whatsapp-rpc/node_modules/get-stream/package.json +53 -0
- package/server/whatsapp-rpc/node_modules/get-stream/readme.md +291 -0
- package/server/whatsapp-rpc/node_modules/get-stream/source/array-buffer.js +84 -0
- package/server/whatsapp-rpc/node_modules/get-stream/source/array.js +32 -0
- package/server/whatsapp-rpc/node_modules/get-stream/source/buffer.js +20 -0
- package/server/whatsapp-rpc/node_modules/get-stream/source/contents.js +101 -0
- package/server/whatsapp-rpc/node_modules/get-stream/source/index.d.ts +119 -0
- package/server/whatsapp-rpc/node_modules/get-stream/source/index.js +5 -0
- package/server/whatsapp-rpc/node_modules/get-stream/source/string.js +36 -0
- package/server/whatsapp-rpc/node_modules/get-stream/source/utils.js +11 -0
- package/server/whatsapp-rpc/node_modules/get-them-args/LICENSE +21 -0
- package/server/whatsapp-rpc/node_modules/get-them-args/README.md +95 -0
- package/server/whatsapp-rpc/node_modules/get-them-args/index.js +97 -0
- package/server/whatsapp-rpc/node_modules/get-them-args/package.json +36 -0
- package/server/whatsapp-rpc/node_modules/human-signals/LICENSE +201 -0
- package/server/whatsapp-rpc/node_modules/human-signals/README.md +168 -0
- package/server/whatsapp-rpc/node_modules/human-signals/build/src/core.js +273 -0
- package/server/whatsapp-rpc/node_modules/human-signals/build/src/main.d.ts +73 -0
- package/server/whatsapp-rpc/node_modules/human-signals/build/src/main.js +70 -0
- package/server/whatsapp-rpc/node_modules/human-signals/build/src/realtime.js +16 -0
- package/server/whatsapp-rpc/node_modules/human-signals/build/src/signals.js +34 -0
- package/server/whatsapp-rpc/node_modules/human-signals/package.json +61 -0
- package/server/whatsapp-rpc/node_modules/is-stream/index.d.ts +81 -0
- package/server/whatsapp-rpc/node_modules/is-stream/index.js +29 -0
- package/server/whatsapp-rpc/node_modules/is-stream/license +9 -0
- package/server/whatsapp-rpc/node_modules/is-stream/package.json +44 -0
- package/server/whatsapp-rpc/node_modules/is-stream/readme.md +60 -0
- package/server/whatsapp-rpc/node_modules/isexe/LICENSE +15 -0
- package/server/whatsapp-rpc/node_modules/isexe/README.md +51 -0
- package/server/whatsapp-rpc/node_modules/isexe/index.js +57 -0
- package/server/whatsapp-rpc/node_modules/isexe/mode.js +41 -0
- package/server/whatsapp-rpc/node_modules/isexe/package.json +31 -0
- package/server/whatsapp-rpc/node_modules/isexe/test/basic.js +221 -0
- package/server/whatsapp-rpc/node_modules/isexe/windows.js +42 -0
- package/server/whatsapp-rpc/node_modules/kill-port/.editorconfig +12 -0
- package/server/whatsapp-rpc/node_modules/kill-port/.gitattributes +1 -0
- package/server/whatsapp-rpc/node_modules/kill-port/LICENSE +21 -0
- package/server/whatsapp-rpc/node_modules/kill-port/README.md +140 -0
- package/server/whatsapp-rpc/node_modules/kill-port/cli.js +25 -0
- package/server/whatsapp-rpc/node_modules/kill-port/example.js +21 -0
- package/server/whatsapp-rpc/node_modules/kill-port/index.js +46 -0
- package/server/whatsapp-rpc/node_modules/kill-port/logo.png +0 -0
- package/server/whatsapp-rpc/node_modules/kill-port/package.json +41 -0
- package/server/whatsapp-rpc/node_modules/kill-port/pnpm-lock.yaml +4606 -0
- package/server/whatsapp-rpc/node_modules/kill-port/test.js +16 -0
- package/server/whatsapp-rpc/node_modules/merge-stream/LICENSE +21 -0
- package/server/whatsapp-rpc/node_modules/merge-stream/README.md +78 -0
- package/server/whatsapp-rpc/node_modules/merge-stream/index.js +41 -0
- package/server/whatsapp-rpc/node_modules/merge-stream/package.json +19 -0
- package/server/whatsapp-rpc/node_modules/mimic-fn/index.d.ts +52 -0
- package/server/whatsapp-rpc/node_modules/mimic-fn/index.js +71 -0
- package/server/whatsapp-rpc/node_modules/mimic-fn/license +9 -0
- package/server/whatsapp-rpc/node_modules/mimic-fn/package.json +45 -0
- package/server/whatsapp-rpc/node_modules/mimic-fn/readme.md +90 -0
- package/server/whatsapp-rpc/node_modules/npm-run-path/index.d.ts +90 -0
- package/server/whatsapp-rpc/node_modules/npm-run-path/index.js +52 -0
- package/server/whatsapp-rpc/node_modules/npm-run-path/license +9 -0
- package/server/whatsapp-rpc/node_modules/npm-run-path/node_modules/path-key/index.d.ts +31 -0
- package/server/whatsapp-rpc/node_modules/npm-run-path/node_modules/path-key/index.js +12 -0
- package/server/whatsapp-rpc/node_modules/npm-run-path/node_modules/path-key/license +9 -0
- package/server/whatsapp-rpc/node_modules/npm-run-path/node_modules/path-key/package.json +41 -0
- package/server/whatsapp-rpc/node_modules/npm-run-path/node_modules/path-key/readme.md +57 -0
- package/server/whatsapp-rpc/node_modules/npm-run-path/package.json +49 -0
- package/server/whatsapp-rpc/node_modules/npm-run-path/readme.md +104 -0
- package/server/whatsapp-rpc/node_modules/onetime/index.d.ts +59 -0
- package/server/whatsapp-rpc/node_modules/onetime/index.js +41 -0
- package/server/whatsapp-rpc/node_modules/onetime/license +9 -0
- package/server/whatsapp-rpc/node_modules/onetime/package.json +45 -0
- package/server/whatsapp-rpc/node_modules/onetime/readme.md +94 -0
- package/server/whatsapp-rpc/node_modules/path-key/index.d.ts +40 -0
- package/server/whatsapp-rpc/node_modules/path-key/index.js +16 -0
- package/server/whatsapp-rpc/node_modules/path-key/license +9 -0
- package/server/whatsapp-rpc/node_modules/path-key/package.json +39 -0
- package/server/whatsapp-rpc/node_modules/path-key/readme.md +61 -0
- package/server/whatsapp-rpc/node_modules/shebang-command/index.js +19 -0
- package/server/whatsapp-rpc/node_modules/shebang-command/license +9 -0
- package/server/whatsapp-rpc/node_modules/shebang-command/package.json +34 -0
- package/server/whatsapp-rpc/node_modules/shebang-command/readme.md +34 -0
- package/server/whatsapp-rpc/node_modules/shebang-regex/index.d.ts +22 -0
- package/server/whatsapp-rpc/node_modules/shebang-regex/index.js +2 -0
- package/server/whatsapp-rpc/node_modules/shebang-regex/license +9 -0
- package/server/whatsapp-rpc/node_modules/shebang-regex/package.json +35 -0
- package/server/whatsapp-rpc/node_modules/shebang-regex/readme.md +33 -0
- package/server/whatsapp-rpc/node_modules/shell-exec/LICENSE +21 -0
- package/server/whatsapp-rpc/node_modules/shell-exec/README.md +60 -0
- package/server/whatsapp-rpc/node_modules/shell-exec/index.js +47 -0
- package/server/whatsapp-rpc/node_modules/shell-exec/package.json +29 -0
- package/server/whatsapp-rpc/node_modules/signal-exit/LICENSE.txt +16 -0
- package/server/whatsapp-rpc/node_modules/signal-exit/README.md +74 -0
- package/server/whatsapp-rpc/node_modules/signal-exit/dist/cjs/browser.d.ts +12 -0
- package/server/whatsapp-rpc/node_modules/signal-exit/dist/cjs/browser.d.ts.map +1 -0
- package/server/whatsapp-rpc/node_modules/signal-exit/dist/cjs/browser.js +10 -0
- package/server/whatsapp-rpc/node_modules/signal-exit/dist/cjs/browser.js.map +1 -0
- package/server/whatsapp-rpc/node_modules/signal-exit/dist/cjs/index.d.ts +48 -0
- package/server/whatsapp-rpc/node_modules/signal-exit/dist/cjs/index.d.ts.map +1 -0
- package/server/whatsapp-rpc/node_modules/signal-exit/dist/cjs/index.js +279 -0
- package/server/whatsapp-rpc/node_modules/signal-exit/dist/cjs/index.js.map +1 -0
- package/server/whatsapp-rpc/node_modules/signal-exit/dist/cjs/package.json +3 -0
- package/server/whatsapp-rpc/node_modules/signal-exit/dist/cjs/signals.d.ts +29 -0
- package/server/whatsapp-rpc/node_modules/signal-exit/dist/cjs/signals.d.ts.map +1 -0
- package/server/whatsapp-rpc/node_modules/signal-exit/dist/cjs/signals.js +42 -0
- package/server/whatsapp-rpc/node_modules/signal-exit/dist/cjs/signals.js.map +1 -0
- package/server/whatsapp-rpc/node_modules/signal-exit/dist/mjs/browser.d.ts +12 -0
- package/server/whatsapp-rpc/node_modules/signal-exit/dist/mjs/browser.d.ts.map +1 -0
- package/server/whatsapp-rpc/node_modules/signal-exit/dist/mjs/browser.js +4 -0
- package/server/whatsapp-rpc/node_modules/signal-exit/dist/mjs/browser.js.map +1 -0
- package/server/whatsapp-rpc/node_modules/signal-exit/dist/mjs/index.d.ts +48 -0
- package/server/whatsapp-rpc/node_modules/signal-exit/dist/mjs/index.d.ts.map +1 -0
- package/server/whatsapp-rpc/node_modules/signal-exit/dist/mjs/index.js +275 -0
- package/server/whatsapp-rpc/node_modules/signal-exit/dist/mjs/index.js.map +1 -0
- package/server/whatsapp-rpc/node_modules/signal-exit/dist/mjs/package.json +3 -0
- package/server/whatsapp-rpc/node_modules/signal-exit/dist/mjs/signals.d.ts +29 -0
- package/server/whatsapp-rpc/node_modules/signal-exit/dist/mjs/signals.d.ts.map +1 -0
- package/server/whatsapp-rpc/node_modules/signal-exit/dist/mjs/signals.js +39 -0
- package/server/whatsapp-rpc/node_modules/signal-exit/dist/mjs/signals.js.map +1 -0
- package/server/whatsapp-rpc/node_modules/signal-exit/package.json +106 -0
- package/server/whatsapp-rpc/node_modules/strip-final-newline/index.js +14 -0
- package/server/whatsapp-rpc/node_modules/strip-final-newline/license +9 -0
- package/server/whatsapp-rpc/node_modules/strip-final-newline/package.json +43 -0
- package/server/whatsapp-rpc/node_modules/strip-final-newline/readme.md +35 -0
- package/server/whatsapp-rpc/node_modules/which/CHANGELOG.md +166 -0
- package/server/whatsapp-rpc/node_modules/which/LICENSE +15 -0
- package/server/whatsapp-rpc/node_modules/which/README.md +54 -0
- package/server/whatsapp-rpc/node_modules/which/bin/node-which +52 -0
- package/server/whatsapp-rpc/node_modules/which/package.json +43 -0
- package/server/whatsapp-rpc/node_modules/which/which.js +125 -0
- package/server/whatsapp-rpc/package-lock.json +272 -0
- package/server/whatsapp-rpc/package.json +30 -30
- package/server/whatsapp-rpc/schema.json +1294 -1294
- package/server/whatsapp-rpc/scripts/clean.cjs +66 -66
- package/server/whatsapp-rpc/scripts/cli.js +162 -162
- package/server/whatsapp-rpc/src/go/whatsapp/history.go +166 -166
- package/server/whatsapp-rpc/src/python/pyproject.toml +15 -15
- package/server/whatsapp-rpc/src/python/whatsapp_rpc/__init__.py +4 -4
- package/server/whatsapp-rpc/src/python/whatsapp_rpc/client.py +427 -427
- package/server/whatsapp-rpc/web/app.py +609 -609
- package/server/whatsapp-rpc/web/requirements.txt +6 -6
- package/server/whatsapp-rpc/web/rpc_client.py +427 -427
- package/server/whatsapp-rpc/web/static/openapi.yaml +59 -59
- package/server/whatsapp-rpc/web/templates/base.html +149 -149
- package/server/whatsapp-rpc/web/templates/contacts.html +240 -240
- package/server/whatsapp-rpc/web/templates/dashboard.html +319 -319
- package/server/whatsapp-rpc/web/templates/groups.html +328 -328
- package/server/whatsapp-rpc/web/templates/messages.html +465 -465
- package/server/whatsapp-rpc/web/templates/messaging.html +680 -680
- package/server/whatsapp-rpc/web/templates/send.html +258 -258
- package/server/whatsapp-rpc/web/templates/settings.html +459 -459
- package/client/src/components/ui/AndroidSettingsPanel.tsx +0 -401
- package/client/src/components/ui/WhatsAppSettingsPanel.tsx +0 -345
- package/client/src/nodeDefinitions/androidDeviceNodes.ts +0 -140
- package/docker-compose.prod.yml +0 -107
- package/docker-compose.yml +0 -104
- package/docs-MachinaOs/README.md +0 -85
- package/docs-MachinaOs/deployment/docker.mdx +0 -228
- package/docs-MachinaOs/deployment/production.mdx +0 -345
- package/docs-MachinaOs/docs.json +0 -75
- package/docs-MachinaOs/faq.mdx +0 -309
- package/docs-MachinaOs/favicon.svg +0 -5
- package/docs-MachinaOs/installation.mdx +0 -160
- package/docs-MachinaOs/introduction.mdx +0 -114
- package/docs-MachinaOs/logo/dark.svg +0 -6
- package/docs-MachinaOs/logo/light.svg +0 -6
- package/docs-MachinaOs/nodes/ai-agent.mdx +0 -216
- package/docs-MachinaOs/nodes/ai-models.mdx +0 -240
- package/docs-MachinaOs/nodes/android.mdx +0 -411
- package/docs-MachinaOs/nodes/overview.mdx +0 -181
- package/docs-MachinaOs/nodes/schedulers.mdx +0 -316
- package/docs-MachinaOs/nodes/webhooks.mdx +0 -330
- package/docs-MachinaOs/nodes/whatsapp.mdx +0 -305
- package/docs-MachinaOs/quickstart.mdx +0 -119
- package/docs-MachinaOs/tutorials/ai-agent-workflow.mdx +0 -177
- package/docs-MachinaOs/tutorials/android-automation.mdx +0 -242
- package/docs-MachinaOs/tutorials/first-workflow.mdx +0 -134
- package/docs-MachinaOs/tutorials/whatsapp-automation.mdx +0 -185
- package/nul +0 -0
- package/scripts/check-ports.ps1 +0 -33
- package/scripts/kill-port.ps1 +0 -154
|
@@ -1,1351 +1,1351 @@
|
|
|
1
|
-
"""Workflow executor with Conductor decide pattern and parallel execution.
|
|
2
|
-
|
|
3
|
-
Implements:
|
|
4
|
-
- Conductor-style workflow_decide() for orchestration
|
|
5
|
-
- Prefect-style task caching for idempotency
|
|
6
|
-
- Fork/Join parallel execution with asyncio.wait (FIRST_COMPLETED pattern)
|
|
7
|
-
- Dynamic workflow branching at runtime
|
|
8
|
-
- Proper handling of long-running trigger nodes in parallel batches
|
|
9
|
-
"""
|
|
10
|
-
|
|
11
|
-
import asyncio
|
|
12
|
-
import time
|
|
13
|
-
from collections import defaultdict
|
|
14
|
-
from datetime import datetime
|
|
15
|
-
from typing import Dict, Any, List, Optional, Callable, Awaitable, Set
|
|
16
|
-
|
|
17
|
-
from core.logging import get_logger
|
|
18
|
-
from constants import WORKFLOW_TRIGGER_TYPES
|
|
19
|
-
from .models import (
|
|
20
|
-
ExecutionContext,
|
|
21
|
-
TaskStatus,
|
|
22
|
-
WorkflowStatus,
|
|
23
|
-
NodeExecution,
|
|
24
|
-
hash_inputs,
|
|
25
|
-
RetryPolicy,
|
|
26
|
-
get_retry_policy,
|
|
27
|
-
)
|
|
28
|
-
from .cache import ExecutionCache
|
|
29
|
-
from .conditions import evaluate_condition, decide_next_edges
|
|
30
|
-
from .dlq import create_dlq_handler, DLQHandlerProtocol, NullDLQHandler
|
|
31
|
-
|
|
32
|
-
logger = get_logger(__name__)
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
def is_trigger_node(node_type: str) -> bool:
|
|
36
|
-
"""Check if a node type is a trigger node (workflow starting point).
|
|
37
|
-
|
|
38
|
-
Trigger nodes have no input handles and serve as entry points for workflows.
|
|
39
|
-
They are identified by WORKFLOW_TRIGGER_TYPES in constants.py.
|
|
40
|
-
|
|
41
|
-
Args:
|
|
42
|
-
node_type: The node type string
|
|
43
|
-
|
|
44
|
-
Returns:
|
|
45
|
-
True if the node is a trigger type
|
|
46
|
-
"""
|
|
47
|
-
return node_type in WORKFLOW_TRIGGER_TYPES
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
class WorkflowExecutor:
|
|
51
|
-
"""Executes workflows using Conductor decide pattern with parallel execution.
|
|
52
|
-
|
|
53
|
-
Features:
|
|
54
|
-
- Isolated ExecutionContext per workflow run
|
|
55
|
-
- Parallel execution of independent nodes (Fork/Join)
|
|
56
|
-
- Result caching for idempotency (Prefect pattern)
|
|
57
|
-
- Distributed locking to prevent race conditions
|
|
58
|
-
- Event history for debugging and recovery
|
|
59
|
-
"""
|
|
60
|
-
|
|
61
|
-
def __init__(self, cache: ExecutionCache,
|
|
62
|
-
node_executor: Callable[[str, str, Dict, Dict], Awaitable[Dict]],
|
|
63
|
-
status_callback: Callable[[str, str, Dict], Awaitable[None]] = None,
|
|
64
|
-
dlq_enabled: bool = False):
|
|
65
|
-
"""Initialize executor.
|
|
66
|
-
|
|
67
|
-
Args:
|
|
68
|
-
cache: ExecutionCache for Redis persistence
|
|
69
|
-
node_executor: Async function to execute a single node
|
|
70
|
-
Signature: async def execute(node_id, node_type, params, context) -> result
|
|
71
|
-
status_callback: Optional async callback for status updates
|
|
72
|
-
Signature: async def callback(node_id, status, data)
|
|
73
|
-
dlq_enabled: Whether to add failed nodes to Dead Letter Queue
|
|
74
|
-
"""
|
|
75
|
-
self.cache = cache
|
|
76
|
-
self.node_executor = node_executor
|
|
77
|
-
self.status_callback = status_callback
|
|
78
|
-
|
|
79
|
-
# Create DLQ handler (modular - uses Null Object pattern when disabled)
|
|
80
|
-
self.dlq = create_dlq_handler(cache, enabled=dlq_enabled)
|
|
81
|
-
|
|
82
|
-
# Active executions (in-memory for fast lookup)
|
|
83
|
-
self._active_contexts: Dict[str, ExecutionContext] = {}
|
|
84
|
-
|
|
85
|
-
# =========================================================================
|
|
86
|
-
# EXECUTION ENTRY POINTS
|
|
87
|
-
# =========================================================================
|
|
88
|
-
|
|
89
|
-
async def execute_workflow(self, workflow_id: str, nodes: List[Dict],
|
|
90
|
-
edges: List[Dict], session_id: str = "default",
|
|
91
|
-
enable_caching: bool = True) -> Dict[str, Any]:
|
|
92
|
-
"""Execute a workflow with parallel node execution.
|
|
93
|
-
|
|
94
|
-
Args:
|
|
95
|
-
workflow_id: Workflow identifier
|
|
96
|
-
nodes: List of workflow nodes
|
|
97
|
-
edges: List of edges connecting nodes
|
|
98
|
-
session_id: Session identifier
|
|
99
|
-
enable_caching: Whether to use result caching
|
|
100
|
-
|
|
101
|
-
Returns:
|
|
102
|
-
Execution result dict
|
|
103
|
-
"""
|
|
104
|
-
start_time = time.time()
|
|
105
|
-
|
|
106
|
-
# Create isolated execution context
|
|
107
|
-
ctx = ExecutionContext.create(
|
|
108
|
-
workflow_id=workflow_id,
|
|
109
|
-
session_id=session_id,
|
|
110
|
-
nodes=nodes,
|
|
111
|
-
edges=edges,
|
|
112
|
-
)
|
|
113
|
-
|
|
114
|
-
# Compute execution layers (for parallel batches)
|
|
115
|
-
ctx.execution_order = self._compute_execution_layers(nodes, edges)
|
|
116
|
-
|
|
117
|
-
logger.info("Starting workflow execution",
|
|
118
|
-
execution_id=ctx.execution_id,
|
|
119
|
-
workflow_id=workflow_id,
|
|
120
|
-
node_count=len(nodes),
|
|
121
|
-
layers=len(ctx.execution_order))
|
|
122
|
-
|
|
123
|
-
# Track in memory
|
|
124
|
-
self._active_contexts[ctx.execution_id] = ctx
|
|
125
|
-
|
|
126
|
-
# Persist initial state
|
|
127
|
-
ctx.status = WorkflowStatus.RUNNING
|
|
128
|
-
ctx.started_at = time.time()
|
|
129
|
-
await self.cache.save_execution_state(ctx)
|
|
130
|
-
|
|
131
|
-
# Add workflow_started event
|
|
132
|
-
await self.cache.add_event(ctx.execution_id, "workflow_started", {
|
|
133
|
-
"workflow_id": workflow_id,
|
|
134
|
-
"node_count": len(nodes),
|
|
135
|
-
})
|
|
136
|
-
|
|
137
|
-
try:
|
|
138
|
-
# Run the decide loop
|
|
139
|
-
await self._workflow_decide(ctx, enable_caching)
|
|
140
|
-
|
|
141
|
-
# Determine final status
|
|
142
|
-
if ctx.all_nodes_complete():
|
|
143
|
-
ctx.status = WorkflowStatus.COMPLETED
|
|
144
|
-
elif ctx.errors:
|
|
145
|
-
ctx.status = WorkflowStatus.FAILED
|
|
146
|
-
|
|
147
|
-
ctx.completed_at = time.time()
|
|
148
|
-
await self.cache.save_execution_state(ctx)
|
|
149
|
-
|
|
150
|
-
# Add workflow_completed event
|
|
151
|
-
await self.cache.add_event(ctx.execution_id, "workflow_completed", {
|
|
152
|
-
"status": ctx.status.value,
|
|
153
|
-
"completed_nodes": len(ctx.get_completed_nodes()),
|
|
154
|
-
"execution_time": ctx.completed_at - ctx.started_at,
|
|
155
|
-
})
|
|
156
|
-
|
|
157
|
-
return {
|
|
158
|
-
"success": ctx.status == WorkflowStatus.COMPLETED,
|
|
159
|
-
"execution_id": ctx.execution_id,
|
|
160
|
-
"status": ctx.status.value,
|
|
161
|
-
"nodes_executed": ctx.get_completed_nodes(),
|
|
162
|
-
"outputs": ctx.outputs,
|
|
163
|
-
"errors": ctx.errors,
|
|
164
|
-
"execution_time": time.time() - start_time,
|
|
165
|
-
"timestamp": datetime.now().isoformat(),
|
|
166
|
-
}
|
|
167
|
-
|
|
168
|
-
except asyncio.CancelledError:
|
|
169
|
-
ctx.status = WorkflowStatus.CANCELLED
|
|
170
|
-
ctx.completed_at = time.time()
|
|
171
|
-
await self.cache.save_execution_state(ctx)
|
|
172
|
-
await self.cache.add_event(ctx.execution_id, "workflow_cancelled", {})
|
|
173
|
-
return {
|
|
174
|
-
"success": False,
|
|
175
|
-
"execution_id": ctx.execution_id,
|
|
176
|
-
"status": "cancelled",
|
|
177
|
-
"error": "Cancelled by user",
|
|
178
|
-
"execution_time": time.time() - start_time,
|
|
179
|
-
}
|
|
180
|
-
|
|
181
|
-
except Exception as e:
|
|
182
|
-
logger.error("Workflow execution failed", execution_id=ctx.execution_id,
|
|
183
|
-
error=str(e))
|
|
184
|
-
ctx.status = WorkflowStatus.FAILED
|
|
185
|
-
ctx.errors.append({"error": str(e), "timestamp": time.time()})
|
|
186
|
-
await self.cache.save_execution_state(ctx)
|
|
187
|
-
await self.cache.add_event(ctx.execution_id, "workflow_failed", {
|
|
188
|
-
"error": str(e),
|
|
189
|
-
})
|
|
190
|
-
return {
|
|
191
|
-
"success": False,
|
|
192
|
-
"execution_id": ctx.execution_id,
|
|
193
|
-
"status": "failed",
|
|
194
|
-
"error": str(e),
|
|
195
|
-
"execution_time": time.time() - start_time,
|
|
196
|
-
}
|
|
197
|
-
|
|
198
|
-
finally:
|
|
199
|
-
# Cleanup
|
|
200
|
-
self._active_contexts.pop(ctx.execution_id, None)
|
|
201
|
-
|
|
202
|
-
async def cancel_execution(self, execution_id: str) -> bool:
|
|
203
|
-
"""Cancel a running execution.
|
|
204
|
-
|
|
205
|
-
Args:
|
|
206
|
-
execution_id: Execution to cancel
|
|
207
|
-
|
|
208
|
-
Returns:
|
|
209
|
-
True if cancelled successfully
|
|
210
|
-
"""
|
|
211
|
-
ctx = self._active_contexts.get(execution_id)
|
|
212
|
-
if ctx:
|
|
213
|
-
ctx.status = WorkflowStatus.CANCELLED
|
|
214
|
-
for node_exec in ctx.node_executions.values():
|
|
215
|
-
if node_exec.status in (TaskStatus.PENDING, TaskStatus.SCHEDULED,
|
|
216
|
-
TaskStatus.RUNNING, TaskStatus.WAITING):
|
|
217
|
-
node_exec.status = TaskStatus.CANCELLED
|
|
218
|
-
await self.cache.save_execution_state(ctx)
|
|
219
|
-
logger.info("Execution cancelled", execution_id=execution_id)
|
|
220
|
-
return True
|
|
221
|
-
return False
|
|
222
|
-
|
|
223
|
-
# =========================================================================
|
|
224
|
-
# CONDUCTOR DECIDE PATTERN
|
|
225
|
-
# =========================================================================
|
|
226
|
-
|
|
227
|
-
async def _workflow_decide(self, ctx: ExecutionContext,
|
|
228
|
-
enable_caching: bool = True) -> None:
|
|
229
|
-
"""Core orchestration loop - Conductor's decide pattern.
|
|
230
|
-
|
|
231
|
-
Evaluates current state, finds ready nodes, executes them in parallel,
|
|
232
|
-
then recurses until all nodes complete or error occurs.
|
|
233
|
-
|
|
234
|
-
Args:
|
|
235
|
-
ctx: ExecutionContext to process
|
|
236
|
-
enable_caching: Whether to use result caching
|
|
237
|
-
"""
|
|
238
|
-
# Distributed lock prevents concurrent decides for same execution
|
|
239
|
-
try:
|
|
240
|
-
async with self.cache.distributed_lock(
|
|
241
|
-
f"execution:{ctx.execution_id}:decide", timeout=60
|
|
242
|
-
):
|
|
243
|
-
await self._decide_iteration(ctx, enable_caching)
|
|
244
|
-
except TimeoutError:
|
|
245
|
-
logger.warning("Could not acquire decide lock",
|
|
246
|
-
execution_id=ctx.execution_id)
|
|
247
|
-
# Retry after short delay
|
|
248
|
-
await asyncio.sleep(0.5)
|
|
249
|
-
await self._workflow_decide(ctx, enable_caching)
|
|
250
|
-
|
|
251
|
-
async def _decide_iteration(self, ctx: ExecutionContext,
|
|
252
|
-
enable_caching: bool) -> None:
|
|
253
|
-
"""Continuous scheduling loop - Temporal/Conductor pattern.
|
|
254
|
-
|
|
255
|
-
When any node completes, immediately check for newly-ready dependents
|
|
256
|
-
and start them without waiting for entire layer to complete.
|
|
257
|
-
|
|
258
|
-
Example: Cron3 (5s) completes -> immediately start WS3,
|
|
259
|
-
even while Cron1 (20s) is still running.
|
|
260
|
-
"""
|
|
261
|
-
# Check if cancelled
|
|
262
|
-
if ctx.status == WorkflowStatus.CANCELLED:
|
|
263
|
-
return
|
|
264
|
-
|
|
265
|
-
# Find initial ready nodes
|
|
266
|
-
ready_nodes = self._find_ready_nodes(ctx)
|
|
267
|
-
|
|
268
|
-
if not ready_nodes:
|
|
269
|
-
if ctx.all_nodes_complete():
|
|
270
|
-
logger.info("All nodes complete", execution_id=ctx.execution_id)
|
|
271
|
-
else:
|
|
272
|
-
pending = ctx.get_pending_nodes()
|
|
273
|
-
if pending:
|
|
274
|
-
logger.warning("Stuck: pending nodes with unsatisfied deps",
|
|
275
|
-
execution_id=ctx.execution_id,
|
|
276
|
-
pending=pending)
|
|
277
|
-
return
|
|
278
|
-
|
|
279
|
-
logger.info("Starting continuous execution",
|
|
280
|
-
execution_id=ctx.execution_id,
|
|
281
|
-
initial_batch=len(ready_nodes),
|
|
282
|
-
nodes=[n.node_id for n in ready_nodes])
|
|
283
|
-
|
|
284
|
-
# Execute with continuous scheduling - new pattern
|
|
285
|
-
await self._execute_with_continuous_scheduling(ctx, ready_nodes, enable_caching)
|
|
286
|
-
|
|
287
|
-
# Save final state
|
|
288
|
-
await self.cache.save_execution_state(ctx)
|
|
289
|
-
|
|
290
|
-
# =========================================================================
|
|
291
|
-
# CONTINUOUS SCHEDULING (Temporal/Conductor Pattern)
|
|
292
|
-
# =========================================================================
|
|
293
|
-
|
|
294
|
-
async def _execute_with_continuous_scheduling(
|
|
295
|
-
self,
|
|
296
|
-
ctx: ExecutionContext,
|
|
297
|
-
initial_nodes: List[NodeExecution],
|
|
298
|
-
enable_caching: bool
|
|
299
|
-
) -> None:
|
|
300
|
-
"""Execute workflow with continuous scheduling.
|
|
301
|
-
|
|
302
|
-
Modern pattern: When any node completes, immediately check for and start
|
|
303
|
-
newly-ready dependent nodes. This enables true parallel pipelines where
|
|
304
|
-
each path progresses independently.
|
|
305
|
-
|
|
306
|
-
Uses asyncio.wait(FIRST_COMPLETED) to process completions immediately.
|
|
307
|
-
|
|
308
|
-
Args:
|
|
309
|
-
ctx: ExecutionContext
|
|
310
|
-
initial_nodes: Initial batch of ready nodes
|
|
311
|
-
enable_caching: Whether to use result caching
|
|
312
|
-
"""
|
|
313
|
-
# Track all running tasks: task -> NodeExecution
|
|
314
|
-
task_to_node: Dict[asyncio.Task, NodeExecution] = {}
|
|
315
|
-
pending_tasks: Set[asyncio.Task] = set()
|
|
316
|
-
workflow_failed = False
|
|
317
|
-
|
|
318
|
-
def create_node_task(node: NodeExecution) -> asyncio.Task:
|
|
319
|
-
"""Create and track a task for node execution."""
|
|
320
|
-
node.status = TaskStatus.SCHEDULED
|
|
321
|
-
task = asyncio.create_task(
|
|
322
|
-
self._execute_node_with_retry(ctx, node, enable_caching),
|
|
323
|
-
name=f"node_{node.node_id}"
|
|
324
|
-
)
|
|
325
|
-
task_to_node[task] = node
|
|
326
|
-
pending_tasks.add(task)
|
|
327
|
-
return task
|
|
328
|
-
|
|
329
|
-
# Start initial nodes
|
|
330
|
-
for node in initial_nodes:
|
|
331
|
-
create_node_task(node)
|
|
332
|
-
await self._notify_status(node.node_id, "scheduled", {})
|
|
333
|
-
logger.info("Scheduled node", node_id=node.node_id)
|
|
334
|
-
|
|
335
|
-
# Process completions and schedule new nodes continuously
|
|
336
|
-
while pending_tasks and not workflow_failed:
|
|
337
|
-
if ctx.status == WorkflowStatus.CANCELLED:
|
|
338
|
-
# Cancel all pending tasks
|
|
339
|
-
for task in pending_tasks:
|
|
340
|
-
task.cancel()
|
|
341
|
-
break
|
|
342
|
-
|
|
343
|
-
# Wait for ANY task to complete
|
|
344
|
-
done, pending_tasks = await asyncio.wait(
|
|
345
|
-
pending_tasks,
|
|
346
|
-
return_when=asyncio.FIRST_COMPLETED
|
|
347
|
-
)
|
|
348
|
-
|
|
349
|
-
# Process each completed task
|
|
350
|
-
for task in done:
|
|
351
|
-
node = task_to_node[task]
|
|
352
|
-
newly_ready = []
|
|
353
|
-
|
|
354
|
-
try:
|
|
355
|
-
result = task.result()
|
|
356
|
-
|
|
357
|
-
if isinstance(result, Exception):
|
|
358
|
-
node.status = TaskStatus.FAILED
|
|
359
|
-
node.error = str(result)
|
|
360
|
-
node.completed_at = time.time()
|
|
361
|
-
ctx.errors.append({
|
|
362
|
-
"node_id": node.node_id,
|
|
363
|
-
"error": str(result),
|
|
364
|
-
"timestamp": time.time(),
|
|
365
|
-
})
|
|
366
|
-
await self._notify_status(node.node_id, "error", {"error": str(result)})
|
|
367
|
-
logger.error("Node failed", node_id=node.node_id, error=str(result))
|
|
368
|
-
workflow_failed = True
|
|
369
|
-
|
|
370
|
-
elif result.get("retries_exhausted"):
|
|
371
|
-
node.status = TaskStatus.FAILED
|
|
372
|
-
node.error = result.get("error", "Unknown error")
|
|
373
|
-
node.completed_at = time.time()
|
|
374
|
-
ctx.errors.append({
|
|
375
|
-
"node_id": node.node_id,
|
|
376
|
-
"error": node.error,
|
|
377
|
-
"retries_exhausted": True,
|
|
378
|
-
"timestamp": time.time(),
|
|
379
|
-
})
|
|
380
|
-
workflow_failed = True
|
|
381
|
-
|
|
382
|
-
elif not result.get("success"):
|
|
383
|
-
node.status = TaskStatus.FAILED
|
|
384
|
-
node.error = result.get("error", "Unknown error")
|
|
385
|
-
node.completed_at = time.time()
|
|
386
|
-
ctx.errors.append({
|
|
387
|
-
"node_id": node.node_id,
|
|
388
|
-
"error": node.error,
|
|
389
|
-
"timestamp": time.time(),
|
|
390
|
-
})
|
|
391
|
-
await self._notify_status(node.node_id, "error", {"error": node.error})
|
|
392
|
-
logger.error("Node failed", node_id=node.node_id, error=node.error)
|
|
393
|
-
workflow_failed = True
|
|
394
|
-
|
|
395
|
-
else:
|
|
396
|
-
# Success - checkpoint and find newly ready nodes
|
|
397
|
-
ctx.add_checkpoint(node.node_id)
|
|
398
|
-
logger.info("Node completed", node_id=node.node_id)
|
|
399
|
-
|
|
400
|
-
# Find nodes that are now ready (their dependencies just completed)
|
|
401
|
-
newly_ready = self._find_ready_nodes(ctx)
|
|
402
|
-
|
|
403
|
-
except asyncio.CancelledError:
|
|
404
|
-
node.status = TaskStatus.CANCELLED
|
|
405
|
-
node.completed_at = time.time()
|
|
406
|
-
logger.info("Node cancelled", node_id=node.node_id)
|
|
407
|
-
|
|
408
|
-
except Exception as e:
|
|
409
|
-
node.status = TaskStatus.FAILED
|
|
410
|
-
node.error = str(e)
|
|
411
|
-
node.completed_at = time.time()
|
|
412
|
-
ctx.errors.append({
|
|
413
|
-
"node_id": node.node_id,
|
|
414
|
-
"error": str(e),
|
|
415
|
-
"timestamp": time.time(),
|
|
416
|
-
})
|
|
417
|
-
await self._notify_status(node.node_id, "error", {"error": str(e)})
|
|
418
|
-
logger.error("Node exception", node_id=node.node_id, error=str(e))
|
|
419
|
-
workflow_failed = True
|
|
420
|
-
|
|
421
|
-
# Schedule newly ready nodes immediately
|
|
422
|
-
if newly_ready and not workflow_failed:
|
|
423
|
-
for ready_node in newly_ready:
|
|
424
|
-
create_node_task(ready_node)
|
|
425
|
-
await self._notify_status(ready_node.node_id, "scheduled", {})
|
|
426
|
-
logger.info("Scheduled dependent node",
|
|
427
|
-
node_id=ready_node.node_id,
|
|
428
|
-
triggered_by=node.node_id)
|
|
429
|
-
|
|
430
|
-
# Periodic state save
|
|
431
|
-
await self.cache.save_execution_state(ctx)
|
|
432
|
-
|
|
433
|
-
# Handle workflow failure - cancel remaining tasks
|
|
434
|
-
if workflow_failed and pending_tasks:
|
|
435
|
-
logger.info("Workflow failed, cancelling remaining tasks",
|
|
436
|
-
pending_count=len(pending_tasks))
|
|
437
|
-
|
|
438
|
-
for task in pending_tasks:
|
|
439
|
-
task.cancel()
|
|
440
|
-
|
|
441
|
-
# Wait for cancelled tasks
|
|
442
|
-
if pending_tasks:
|
|
443
|
-
cancelled_done, _ = await asyncio.wait(
|
|
444
|
-
pending_tasks,
|
|
445
|
-
return_when=asyncio.ALL_COMPLETED
|
|
446
|
-
)
|
|
447
|
-
|
|
448
|
-
for task in cancelled_done:
|
|
449
|
-
node = task_to_node.get(task)
|
|
450
|
-
if node and node.status not in (TaskStatus.COMPLETED, TaskStatus.FAILED):
|
|
451
|
-
node.status = TaskStatus.CANCELLED
|
|
452
|
-
node.completed_at = time.time()
|
|
453
|
-
|
|
454
|
-
ctx.status = WorkflowStatus.FAILED
|
|
455
|
-
|
|
456
|
-
# =========================================================================
|
|
457
|
-
# PARALLEL EXECUTION (Legacy - Fork/Join with FIRST_COMPLETED pattern)
|
|
458
|
-
# =========================================================================
|
|
459
|
-
|
|
460
|
-
async def _execute_parallel_nodes(self, ctx: ExecutionContext,
|
|
461
|
-
nodes: List[NodeExecution],
|
|
462
|
-
enable_caching: bool) -> None:
|
|
463
|
-
"""Execute multiple nodes in parallel using asyncio.wait with FIRST_COMPLETED.
|
|
464
|
-
|
|
465
|
-
Uses the standard asyncio pattern for mixed task types:
|
|
466
|
-
- Regular nodes complete quickly
|
|
467
|
-
- Trigger nodes wait indefinitely for external events
|
|
468
|
-
- If a regular node fails, cancel remaining trigger nodes immediately
|
|
469
|
-
|
|
470
|
-
This follows Python asyncio best practices:
|
|
471
|
-
https://docs.python.org/3/library/asyncio-task.html#asyncio.wait
|
|
472
|
-
|
|
473
|
-
Args:
|
|
474
|
-
ctx: ExecutionContext
|
|
475
|
-
nodes: List of NodeExecution to run in parallel
|
|
476
|
-
enable_caching: Whether to use result caching
|
|
477
|
-
"""
|
|
478
|
-
# Mark all as scheduled
|
|
479
|
-
for node in nodes:
|
|
480
|
-
node.status = TaskStatus.SCHEDULED
|
|
481
|
-
await self._notify_status(node.node_id, "scheduled", {})
|
|
482
|
-
|
|
483
|
-
# Create named tasks for parallel execution
|
|
484
|
-
# Using dict to track node -> task mapping for proper result handling
|
|
485
|
-
node_to_task: Dict[str, asyncio.Task] = {}
|
|
486
|
-
task_to_node: Dict[asyncio.Task, NodeExecution] = {}
|
|
487
|
-
|
|
488
|
-
for node in nodes:
|
|
489
|
-
task = asyncio.create_task(
|
|
490
|
-
self._execute_node_with_retry(ctx, node, enable_caching),
|
|
491
|
-
name=f"node_{node.node_id}"
|
|
492
|
-
)
|
|
493
|
-
node_to_task[node.node_id] = task
|
|
494
|
-
task_to_node[task] = node
|
|
495
|
-
|
|
496
|
-
pending: Set[asyncio.Task] = set(node_to_task.values())
|
|
497
|
-
workflow_failed = False
|
|
498
|
-
|
|
499
|
-
# Process tasks as they complete using FIRST_COMPLETED pattern
|
|
500
|
-
while pending:
|
|
501
|
-
# Wait for any task to complete
|
|
502
|
-
done, pending = await asyncio.wait(
|
|
503
|
-
pending,
|
|
504
|
-
return_when=asyncio.FIRST_COMPLETED
|
|
505
|
-
)
|
|
506
|
-
|
|
507
|
-
# Process completed tasks
|
|
508
|
-
for task in done:
|
|
509
|
-
node = task_to_node[task]
|
|
510
|
-
|
|
511
|
-
try:
|
|
512
|
-
result = task.result()
|
|
513
|
-
|
|
514
|
-
if isinstance(result, Exception):
|
|
515
|
-
# Task raised exception
|
|
516
|
-
node.status = TaskStatus.FAILED
|
|
517
|
-
node.error = str(result)
|
|
518
|
-
node.completed_at = time.time()
|
|
519
|
-
ctx.errors.append({
|
|
520
|
-
"node_id": node.node_id,
|
|
521
|
-
"error": str(result),
|
|
522
|
-
"timestamp": time.time(),
|
|
523
|
-
})
|
|
524
|
-
await self._notify_status(node.node_id, "error", {"error": str(result)})
|
|
525
|
-
logger.error("Parallel node failed",
|
|
526
|
-
node_id=node.node_id, error=str(result))
|
|
527
|
-
workflow_failed = True
|
|
528
|
-
|
|
529
|
-
elif result.get("retries_exhausted"):
|
|
530
|
-
# Node failed after all retries - already in DLQ
|
|
531
|
-
node.status = TaskStatus.FAILED
|
|
532
|
-
node.error = result.get("error", "Unknown error")
|
|
533
|
-
node.completed_at = time.time()
|
|
534
|
-
ctx.errors.append({
|
|
535
|
-
"node_id": node.node_id,
|
|
536
|
-
"error": node.error,
|
|
537
|
-
"retries_exhausted": True,
|
|
538
|
-
"timestamp": time.time(),
|
|
539
|
-
})
|
|
540
|
-
workflow_failed = True
|
|
541
|
-
|
|
542
|
-
elif not result.get("success"):
|
|
543
|
-
# Node returned failure without exhausting retries
|
|
544
|
-
node.status = TaskStatus.FAILED
|
|
545
|
-
node.error = result.get("error", "Unknown error")
|
|
546
|
-
node.completed_at = time.time()
|
|
547
|
-
ctx.errors.append({
|
|
548
|
-
"node_id": node.node_id,
|
|
549
|
-
"error": node.error,
|
|
550
|
-
"timestamp": time.time(),
|
|
551
|
-
})
|
|
552
|
-
await self._notify_status(node.node_id, "error", {"error": node.error})
|
|
553
|
-
logger.error("Parallel node failed",
|
|
554
|
-
node_id=node.node_id, error=node.error)
|
|
555
|
-
workflow_failed = True
|
|
556
|
-
|
|
557
|
-
except asyncio.CancelledError:
|
|
558
|
-
# Task was cancelled (by us or externally)
|
|
559
|
-
node.status = TaskStatus.CANCELLED
|
|
560
|
-
node.completed_at = time.time()
|
|
561
|
-
logger.info("Parallel node cancelled", node_id=node.node_id)
|
|
562
|
-
|
|
563
|
-
except Exception as e:
|
|
564
|
-
# Unexpected exception from task.result()
|
|
565
|
-
node.status = TaskStatus.FAILED
|
|
566
|
-
node.error = str(e)
|
|
567
|
-
node.completed_at = time.time()
|
|
568
|
-
ctx.errors.append({
|
|
569
|
-
"node_id": node.node_id,
|
|
570
|
-
"error": str(e),
|
|
571
|
-
"timestamp": time.time(),
|
|
572
|
-
})
|
|
573
|
-
await self._notify_status(node.node_id, "error", {"error": str(e)})
|
|
574
|
-
logger.error("Parallel node exception",
|
|
575
|
-
node_id=node.node_id, error=str(e))
|
|
576
|
-
workflow_failed = True
|
|
577
|
-
|
|
578
|
-
# If workflow failed, cancel remaining pending tasks
|
|
579
|
-
# This prevents trigger nodes from blocking forever when a regular node fails
|
|
580
|
-
if workflow_failed and pending:
|
|
581
|
-
logger.info("Workflow failed, cancelling remaining tasks",
|
|
582
|
-
pending_count=len(pending))
|
|
583
|
-
|
|
584
|
-
for task in pending:
|
|
585
|
-
task.cancel()
|
|
586
|
-
|
|
587
|
-
# Wait for cancelled tasks to finish
|
|
588
|
-
if pending:
|
|
589
|
-
cancelled_done, _ = await asyncio.wait(
|
|
590
|
-
pending,
|
|
591
|
-
return_when=asyncio.ALL_COMPLETED
|
|
592
|
-
)
|
|
593
|
-
|
|
594
|
-
# Mark cancelled nodes
|
|
595
|
-
for task in cancelled_done:
|
|
596
|
-
node = task_to_node[task]
|
|
597
|
-
if node.status not in (TaskStatus.COMPLETED, TaskStatus.FAILED):
|
|
598
|
-
node.status = TaskStatus.CANCELLED
|
|
599
|
-
node.completed_at = time.time()
|
|
600
|
-
logger.info("Cancelled pending node", node_id=node.node_id)
|
|
601
|
-
|
|
602
|
-
pending = set() # All done now
|
|
603
|
-
|
|
604
|
-
# Mark workflow as failed if any node failed
|
|
605
|
-
if workflow_failed:
|
|
606
|
-
ctx.status = WorkflowStatus.FAILED
|
|
607
|
-
|
|
608
|
-
async def _execute_single_node(self, ctx: ExecutionContext,
|
|
609
|
-
node: NodeExecution,
|
|
610
|
-
enable_caching: bool) -> None:
|
|
611
|
-
"""Execute a single node with retry logic.
|
|
612
|
-
|
|
613
|
-
Args:
|
|
614
|
-
ctx: ExecutionContext
|
|
615
|
-
node: NodeExecution to run
|
|
616
|
-
enable_caching: Whether to use result caching
|
|
617
|
-
"""
|
|
618
|
-
node.status = TaskStatus.SCHEDULED
|
|
619
|
-
await self._notify_status(node.node_id, "scheduled", {})
|
|
620
|
-
|
|
621
|
-
try:
|
|
622
|
-
result = await self._execute_node_with_retry(ctx, node, enable_caching)
|
|
623
|
-
|
|
624
|
-
if result.get("retries_exhausted"):
|
|
625
|
-
# Node failed after all retries - already in DLQ
|
|
626
|
-
node.status = TaskStatus.FAILED
|
|
627
|
-
node.error = result.get("error", "Unknown error")
|
|
628
|
-
node.completed_at = time.time()
|
|
629
|
-
ctx.errors.append({
|
|
630
|
-
"node_id": node.node_id,
|
|
631
|
-
"error": node.error,
|
|
632
|
-
"retries_exhausted": True,
|
|
633
|
-
"timestamp": time.time(),
|
|
634
|
-
})
|
|
635
|
-
ctx.status = WorkflowStatus.FAILED
|
|
636
|
-
|
|
637
|
-
except Exception as e:
|
|
638
|
-
node.status = TaskStatus.FAILED
|
|
639
|
-
node.error = str(e)
|
|
640
|
-
node.completed_at = time.time()
|
|
641
|
-
ctx.errors.append({
|
|
642
|
-
"node_id": node.node_id,
|
|
643
|
-
"error": str(e),
|
|
644
|
-
"timestamp": time.time(),
|
|
645
|
-
})
|
|
646
|
-
await self._notify_status(node.node_id, "error", {"error": str(e)})
|
|
647
|
-
ctx.status = WorkflowStatus.FAILED
|
|
648
|
-
|
|
649
|
-
# =========================================================================
|
|
650
|
-
# RETRY LOGIC
|
|
651
|
-
-     # =========================================================================
-
-     async def _execute_node_with_retry(self, ctx: ExecutionContext,
-                                        node: NodeExecution,
-                                        enable_caching: bool) -> Dict[str, Any]:
-         """Execute node with retry logic and DLQ on final failure.
-
-         Uses exponential backoff retry policy based on node type.
-         On exhausted retries, adds entry to Dead Letter Queue.
-
-         Args:
-             ctx: ExecutionContext
-             node: NodeExecution to run
-             enable_caching: Whether to use result caching
-
-         Returns:
-             Execution result
-         """
-         # Get retry policy for this node type
-         node_data = self._get_node_data(ctx, node.node_id)
-         custom_policy = node_data.get("parameters", {}).get("retryPolicy")
-         retry_policy = get_retry_policy(node.node_type, custom_policy)
-
-         last_error = None
-         inputs = self._gather_node_inputs(ctx, node.node_id)
-
-         for attempt in range(retry_policy.max_attempts):
-             try:
-                 node.retry_count = attempt
-                 result = await self._execute_node_with_caching(ctx, node, enable_caching)
-
-                 # Success - return result
-                 if result.get("success"):
-                     return result
-
-                 # Execution returned failure (not exception)
-                 error = result.get("error", "Unknown error")
-                 last_error = error
-
-                 # Check if we should retry
-                 if retry_policy.should_retry(error, attempt + 1):
-                     delay = retry_policy.calculate_delay(attempt)
-                     logger.info("Retrying node after failure",
-                                 node_id=node.node_id,
-                                 attempt=attempt + 1,
-                                 max_attempts=retry_policy.max_attempts,
-                                 delay=delay,
-                                 error=error[:100])
-
-                     await self._notify_status(node.node_id, "retrying", {
-                         "attempt": attempt + 1,
-                         "max_attempts": retry_policy.max_attempts,
-                         "delay": delay,
-                         "error": error,
-                     })
-
-                     await asyncio.sleep(delay)
-
-                     # Reset node status for retry
-                     node.status = TaskStatus.PENDING
-                     node.error = None
-                     continue
-                 else:
-                     # Not retryable, break out
-                     break
-
-             except asyncio.CancelledError:
-                 raise  # Propagate cancellation
-             except Exception as e:
-                 last_error = str(e)
-                 logger.warning("Node execution exception",
-                                node_id=node.node_id,
-                                attempt=attempt + 1,
-                                error=last_error)
-
-                 # Check if we should retry
-                 if retry_policy.should_retry(last_error, attempt + 1):
-                     delay = retry_policy.calculate_delay(attempt)
-                     logger.info("Retrying node after exception",
-                                 node_id=node.node_id,
-                                 attempt=attempt + 1,
-                                 delay=delay)
-
-                     await asyncio.sleep(delay)
-                     node.status = TaskStatus.PENDING
-                     node.error = None
-                     continue
-                 else:
-                     break
-
-         # All retries exhausted - add to DLQ (handler is no-op if disabled)
-         await self.dlq.add_failed_node(ctx, node, inputs, last_error or "Unknown error")
-
-         # Return failure result
-         return {
-             "success": False,
-             "error": last_error or "Unknown error",
-             "retries_exhausted": True,
-             "retry_count": node.retry_count,
-         }
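The retry loop above relies on a `RetryPolicy` from `.models` (not included in this diff), using only `max_attempts`, `should_retry()` and `calculate_delay()`. A minimal sketch of that interface, assuming a plain exponential backoff with jitter; the package's real class may differ:

```python
# Editorial sketch, not the package's implementation.
import random
from dataclasses import dataclass, field
from typing import List

@dataclass
class SketchRetryPolicy:
    max_attempts: int = 3
    base_delay: float = 1.0   # seconds before the first retry
    max_delay: float = 30.0   # cap on the backoff
    non_retryable: List[str] = field(default_factory=lambda: ["invalid api key"])

    def should_retry(self, error: str, attempts_made: int) -> bool:
        # Stop once attempts are exhausted or the error looks permanent.
        if attempts_made >= self.max_attempts:
            return False
        return not any(marker in error.lower() for marker in self.non_retryable)

    def calculate_delay(self, attempt: int) -> float:
        # Exponential backoff (1s, 2s, 4s, ...) capped at max_delay, plus jitter.
        delay = min(self.base_delay * (2 ** attempt), self.max_delay)
        return delay + random.uniform(0, 0.1 * delay)
```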
-
-     # =========================================================================
-     # CACHED NODE EXECUTION (Prefect pattern)
-     # =========================================================================
-
-     async def _execute_node_with_caching(self, ctx: ExecutionContext,
-                                          node: NodeExecution,
-                                          enable_caching: bool) -> Dict[str, Any]:
-         """Execute node with result caching (Prefect pattern).
-
-         Args:
-             ctx: ExecutionContext
-             node: NodeExecution to run
-             enable_caching: Whether to check cache
-
-         Returns:
-             Execution result
-         """
-         # Get node parameters and inputs
-         node_data = self._get_node_data(ctx, node.node_id)
-         inputs = self._gather_node_inputs(ctx, node.node_id)
-
-         # Check cache first (Prefect pattern)
-         if enable_caching:
-             cached_result = await self.cache.get_cached_result(
-                 ctx.execution_id, node.node_id, inputs
-             )
-             if cached_result:
-                 logger.info("Cache hit", node_id=node.node_id)
-                 node.status = TaskStatus.CACHED
-                 node.output = cached_result
-                 node.input_hash = hash_inputs(inputs)
-                 node.completed_at = time.time()
-                 ctx.outputs[node.node_id] = cached_result
-                 await self._notify_status(node.node_id, "success",
-                                           {"cached": True, **cached_result})
-                 await self.cache.add_event(ctx.execution_id, "node_cached", {
-                     "node_id": node.node_id,
-                 })
-                 return cached_result
-
-         # Execute node
-         node.status = TaskStatus.RUNNING
-         node.started_at = time.time()
-         node.input_hash = hash_inputs(inputs)
-         await self._notify_status(node.node_id, "executing", {})
-         await self.cache.add_event(ctx.execution_id, "node_started", {
-             "node_id": node.node_id,
-             "node_type": node.node_type,
-         })
-
-         # Update heartbeat (for crash detection)
-         await self.cache.update_heartbeat(ctx.execution_id, node.node_id)
-
-         # Build execution context for node handler
-         # workflow_id is included for per-workflow status scoping (n8n pattern)
-         exec_context = {
-             "nodes": ctx.nodes,
-             "edges": ctx.edges,
-             "session_id": ctx.session_id,
-             "execution_id": ctx.execution_id,
-             "workflow_id": ctx.workflow_id,  # For per-workflow status broadcasts
-             "start_time": node.started_at,
-             "outputs": ctx.outputs,  # Previous node outputs
-         }
-
-         # Call the actual node executor
-         result = await self.node_executor(
-             node.node_id,
-             node.node_type,
-             node_data.get("parameters", {}),
-             exec_context
-         )
-
-         # Process result
-         if result.get("success"):
-             node.status = TaskStatus.COMPLETED
-             node.output = result.get("result", {})
-             node.completed_at = time.time()
-             ctx.outputs[node.node_id] = node.output
-
-             # Cache result (Prefect pattern)
-             if enable_caching:
-                 await self.cache.set_cached_result(
-                     ctx.execution_id, node.node_id, inputs, node.output
-                 )
-
-             await self._notify_status(node.node_id, "success", node.output)
-             await self.cache.add_event(ctx.execution_id, "node_completed", {
-                 "node_id": node.node_id,
-                 "execution_time": node.completed_at - node.started_at,
-             })
-         else:
-             node.status = TaskStatus.FAILED
-             node.error = result.get("error", "Unknown error")
-             node.completed_at = time.time()
-             ctx.errors.append({
-                 "node_id": node.node_id,
-                 "error": node.error,
-                 "timestamp": time.time(),
-             })
-
-             await self._notify_status(node.node_id, "error", {"error": node.error})
-             await self.cache.add_event(ctx.execution_id, "node_failed", {
-                 "node_id": node.node_id,
-                 "error": node.error,
-             })
-
-             # Mark workflow as failed
-             ctx.status = WorkflowStatus.FAILED
-
-         return result
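Cache hits and stores are keyed off `hash_inputs(inputs)` from `.models`, which is not part of this diff. A plausible sketch of such a key function, assuming the inputs dict is JSON-serialisable after canonicalisation:

```python
# Editorial sketch (assumption): one reasonable shape for hash_inputs().
import hashlib
import json
from typing import Any, Dict

def sketch_hash_inputs(inputs: Dict[str, Any]) -> str:
    # Canonicalise upstream outputs so identical inputs always hash identically.
    canonical = json.dumps(inputs, sort_keys=True, default=str)
    return hashlib.sha256(canonical.encode("utf-8")).hexdigest()
```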
-
-     # =========================================================================
-     # DAG ANALYSIS
-     # =========================================================================
-
-     def _compute_execution_layers(self, nodes: List[Dict],
-                                   edges: List[Dict]) -> List[List[str]]:
-         """Compute execution layers for parallel execution.
-
-         Nodes in the same layer have no dependencies on each other
-         and can execute in parallel. Layer 0 contains trigger nodes
-         (workflow starting points with no input handles).
-
-         Following n8n pattern: Trigger nodes are the starting point of every
-         workflow. They listen for specific events/conditions and initiate
-         the execution of the entire workflow.
-
-         Config nodes and toolkit sub-nodes are excluded from layers since
-         they don't execute as independent workflow nodes.
-
-         Args:
-             nodes: List of workflow nodes
-             edges: List of edges
-
-         Returns:
-             List of layers, where each layer is a list of node IDs
-         """
-         from constants import CONFIG_NODE_TYPES, TOOLKIT_NODE_TYPES
-
-         # Build node type lookup for trigger detection
-         node_types: Dict[str, str] = {
-             node["id"]: node.get("type", "unknown") for node in nodes
-         }
-
-         # Find toolkit sub-nodes (nodes that connect TO a toolkit)
-         toolkit_node_ids = {n.get("id") for n in nodes if n.get("type") in TOOLKIT_NODE_TYPES}
-
-         # Find AI Agent nodes (both aiAgent and chatAgent have config handles)
-         ai_agent_node_ids = {n.get("id") for n in nodes if n.get("type") in ('aiAgent', 'chatAgent')}
-
-         subnode_ids: set = set()
-         for edge in edges:
-             source = edge.get("source")
-             target = edge.get("target")
-             target_handle = edge.get("targetHandle")
-
-             # Any node that connects TO a toolkit is a sub-node
-             if target in toolkit_node_ids and source:
-                 subnode_ids.add(source)
-
-             # Nodes connected to AI Agent config handles are sub-nodes.
-             # These handles: input-memory, input-tools, input-skill
-             if target in ai_agent_node_ids and source and target_handle:
-                 if target_handle in ('input-memory', 'input-tools', 'input-skill'):
-                     subnode_ids.add(source)
-
-         # Filter out config nodes and sub-nodes from execution
-         excluded_ids = set()
-         for node in nodes:
-             node_id = node.get("id")
-             node_type = node.get("type", "unknown")
-             if node_type in CONFIG_NODE_TYPES or node_id in subnode_ids:
-                 excluded_ids.add(node_id)
-
-         # Build adjacency and in-degree maps (excluding filtered nodes)
-         in_degree: Dict[str, int] = defaultdict(int)
-         adjacency: Dict[str, List[str]] = defaultdict(list)
-         node_ids = {node["id"] for node in nodes if node["id"] not in excluded_ids}
-
-         for edge in edges:
-             source = edge.get("source")
-             target = edge.get("target")
-             if source in node_ids and target in node_ids:
-                 adjacency[source].append(target)
-                 in_degree[target] += 1
-
-         # Initialize in-degree for all nodes
-         for node_id in node_ids:
-             if node_id not in in_degree:
-                 in_degree[node_id] = 0
-
-         # Kahn's algorithm for topological sort with layers
-         layers = []
-         remaining = set(node_ids)
-         is_first_layer = True
-
-         while remaining:
-             # Find all nodes with in-degree 0 (no dependencies)
-             layer = [
-                 node_id for node_id in remaining
-                 if in_degree[node_id] == 0
-             ]
-
-             if not layer:
-                 # Cycle detected or stuck
-                 logger.warning("Cycle detected or no start nodes",
-                                remaining=list(remaining))
-                 # Add remaining as single layer to avoid infinite loop
-                 layers.append(list(remaining))
-                 break
-
-             # For layer 0, validate that starting nodes are trigger nodes
-             if is_first_layer:
-                 trigger_nodes = []
-                 non_trigger_nodes = []
-
-                 for node_id in layer:
-                     node_type = node_types.get(node_id, "unknown")
-                     if is_trigger_node(node_type):
-                         trigger_nodes.append(node_id)
-                     else:
-                         non_trigger_nodes.append(node_id)
-                         logger.warning(
-                             "Non-trigger node found at graph entry point",
-                             node_id=node_id,
-                             node_type=node_type,
-                             expected_types=list(WORKFLOW_TRIGGER_TYPES)
-                         )
-
-                 # Log trigger node identification
-                 if trigger_nodes:
-                     logger.info(
-                         "Identified trigger nodes as workflow starting points",
-                         trigger_count=len(trigger_nodes),
-                         trigger_nodes=[
-                             f"{nid[:8]}({node_types.get(nid)})"
-                             for nid in trigger_nodes
-                         ]
-                     )
-
-                 is_first_layer = False
-
-             layers.append(layer)
-
-             # Remove layer nodes and update in-degrees
-             for node_id in layer:
-                 remaining.remove(node_id)
-                 for successor in adjacency[node_id]:
-                     in_degree[successor] -= 1
-
-         logger.debug("Computed execution layers",
-                      layer_count=len(layers),
-                      layers=[[n[:8] for n in l] for l in layers])
-
-         return layers
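The layering above is Kahn's topological sort grouped by depth. A self-contained toy version on plain strings, ignoring the config-node and trigger-node filtering, shows how the layers come out:

```python
# Editorial sketch: Kahn's algorithm grouped into layers, on a toy graph.
from collections import defaultdict
from typing import Dict, List, Tuple

def layer_nodes(nodes: List[str], edges: List[Tuple[str, str]]) -> List[List[str]]:
    in_degree = {n: 0 for n in nodes}
    adjacency: Dict[str, List[str]] = defaultdict(list)
    for source, target in edges:
        adjacency[source].append(target)
        in_degree[target] += 1

    layers, remaining = [], set(nodes)
    while remaining:
        layer = sorted(n for n in remaining if in_degree[n] == 0)
        if not layer:  # cycle: emit the rest as one layer and stop
            layers.append(sorted(remaining))
            break
        layers.append(layer)
        for n in layer:
            remaining.remove(n)
            for succ in adjacency[n]:
                in_degree[succ] -= 1
    return layers

# layer_nodes(["cron", "fetch", "store", "notify"],
#             [("cron", "fetch"), ("fetch", "store"), ("fetch", "notify")])
# -> [["cron"], ["fetch"], ["notify", "store"]]
```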
-
-     def _find_ready_nodes(self, ctx: ExecutionContext) -> List[NodeExecution]:
-         """Find nodes ready to execute (dependencies satisfied + conditions met).
-
-         A node is ready if:
-         - Status is PENDING
-         - Not disabled (n8n-style disable feature)
-         - All upstream nodes are COMPLETED, CACHED, or SKIPPED
-         - Edge conditions (if any) evaluate to True based on upstream outputs
-
-         Supports runtime conditional branching (Prefect-style dynamic workflows).
-
-         Args:
-             ctx: ExecutionContext
-
-         Returns:
-             List of NodeExecution ready to run
-         """
-         from constants import CONFIG_NODE_TYPES
-
-         # Build set of completed nodes
-         completed = set(ctx.get_completed_nodes())
-
-         # Build map of node_id -> node_type for config node detection
-         node_types: Dict[str, str] = {}
-         for node in ctx.nodes:
-             node_types[node.get("id", "")] = node.get("type", "unknown")
-
-         # Build dependency map and track conditional edges
-         # Skip edges from config nodes (they don't execute, provide config only)
-         dependencies: Dict[str, Set[str]] = defaultdict(set)
-         conditional_edges: Dict[str, List[Dict]] = defaultdict(list)  # target -> edges with conditions
-
-         for edge in ctx.edges:
-             target = edge.get("target")
-             source = edge.get("source")
-             if target and source:
-                 # Skip edges from config nodes - they provide configuration, not execution dependencies
-                 source_type = node_types.get(source, "unknown")
-                 if source_type in CONFIG_NODE_TYPES:
-                     continue
-
-                 dependencies[target].add(source)
-                 # Track edges with conditions for evaluation
-                 if edge.get("data", {}).get("condition"):
-                     conditional_edges[target].append(edge)
-
-         # Find ready nodes
-         ready = []
-         for node_id, node_exec in ctx.node_executions.items():
-             if node_exec.status != TaskStatus.PENDING:
-                 continue
-
-             # Check if all dependencies are satisfied
-             deps = dependencies.get(node_id, set())
-             if not deps <= completed:  # Not all deps completed
-                 continue
-
-             # Check if node is disabled (n8n-style disable)
-             node_data = self._get_node_data(ctx, node_id)
-             if node_data.get("data", {}).get("disabled"):
-                 node_exec.status = TaskStatus.SKIPPED
-                 node_exec.completed_at = time.time()
-                 logger.debug("Skipping disabled node", node_id=node_id)
-                 # Notify status callback about skipped node
-                 asyncio.create_task(self._notify_status(node_id, "skipped", {"disabled": True}))
-                 continue
-
-             # Check conditional edges for this node
-             if node_id in conditional_edges:
-                 # Has conditional incoming edges - evaluate them
-                 conditions_met = self._evaluate_incoming_conditions(
-                     ctx, node_id, conditional_edges[node_id]
-                 )
-                 if not conditions_met:
-                     # Mark as SKIPPED if conditions not met and all deps done
-                     node_exec.status = TaskStatus.SKIPPED
-                     logger.info("Node skipped due to unmet conditions",
-                                 node_id=node_id)
-                     continue
-
-             ready.append(node_exec)
-
-         return ready
-
-     def _evaluate_incoming_conditions(self, ctx: ExecutionContext, target_node_id: str,
-                                       edges: List[Dict]) -> bool:
-         """Evaluate conditions on incoming edges to determine if node should run.
-
-         Args:
-             ctx: ExecutionContext
-             target_node_id: The node we're checking
-             edges: Incoming edges with conditions
-
-         Returns:
-             True if at least one conditional edge evaluates to True
-         """
-         for edge in edges:
-             source_id = edge.get("source")
-             condition = edge.get("data", {}).get("condition")
-
-             if not condition:
-                 continue
-
-             # Get output from source node
-             source_output = ctx.outputs.get(source_id, {})
-
-             # Evaluate condition
-             if evaluate_condition(condition, source_output):
-                 logger.debug("Conditional edge matched",
-                              source=source_id,
-                              target=target_node_id,
-                              condition=condition)
-                 return True
-
-         # No conditions matched
-         logger.debug("No conditional edges matched",
-                      target=target_node_id,
-                      edge_count=len(edges))
-         return False
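`evaluate_condition` comes from `.conditions`, which is not part of this diff, so the condition shape is unknown here. A hedged sketch, assuming a simple `{"field", "operator", "value"}` dict checked against the upstream node's output:

```python
# Editorial sketch (assumption): the real condition format may differ.
from typing import Any, Dict

def sketch_evaluate_condition(condition: Dict[str, Any],
                              source_output: Dict[str, Any]) -> bool:
    actual = source_output.get(condition.get("field"))
    expected = condition.get("value")
    operator = condition.get("operator", "equals")
    if operator == "equals":
        return actual == expected
    if operator == "not_equals":
        return actual != expected
    if operator == "contains":
        return expected in (actual or "")
    return False

# sketch_evaluate_condition({"field": "status", "operator": "equals", "value": "ok"},
#                           {"status": "ok"})  # -> True
```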
-
-     def _get_node_data(self, ctx: ExecutionContext, node_id: str) -> Dict[str, Any]:
-         """Get node data from context.
-
-         Args:
-             ctx: ExecutionContext
-             node_id: Node ID
-
-         Returns:
-             Node data dict
-         """
-         for node in ctx.nodes:
-             if node.get("id") == node_id:
-                 return node
-         return {}
-
-     def _gather_node_inputs(self, ctx: ExecutionContext, node_id: str) -> Dict[str, Any]:
-         """Gather inputs for a node from upstream outputs.
-
-         Args:
-             ctx: ExecutionContext
-             node_id: Target node ID
-
-         Returns:
-             Dict of upstream outputs keyed by source node type
-         """
-         inputs = {}
-         for edge in ctx.edges:
-             if edge.get("target") == node_id:
-                 source_id = edge.get("source")
-                 if source_id in ctx.outputs:
-                     # Find source node type
-                     source_node = self._get_node_data(ctx, source_id)
-                     source_type = source_node.get("type", source_id)
-                     inputs[source_type] = ctx.outputs[source_id]
-         return inputs
-
-     # =========================================================================
-     # STATUS NOTIFICATIONS
-     # =========================================================================
-
-     async def _notify_status(self, node_id: str, status: str,
-                              data: Dict[str, Any]) -> None:
-         """Send status notification via callback.
-
-         Args:
-             node_id: Node ID
-             status: Status string
-             data: Additional data
-         """
-         if self.status_callback:
-             try:
-                 await self.status_callback(node_id, status, data)
-             except Exception as e:
-                 logger.warning("Status callback failed", node_id=node_id, error=str(e))
-
-     # =========================================================================
-     # RECOVERY
-     # =========================================================================
-
-     async def recover_execution(self, execution_id: str,
-                                 nodes: List[Dict],
-                                 edges: List[Dict]) -> Optional[Dict[str, Any]]:
-         """Recover and resume an interrupted execution.
-
-         Args:
-             execution_id: Execution ID to recover
-             nodes: Workflow nodes
-             edges: Workflow edges
-
-         Returns:
-             Execution result if resumed, None if not found
-         """
-         ctx = await self.cache.load_execution_state(execution_id, nodes, edges)
-         if not ctx:
-             logger.warning("Execution not found for recovery", execution_id=execution_id)
-             return None
-
-         if ctx.status != WorkflowStatus.RUNNING:
-             logger.info("Execution already complete", execution_id=execution_id,
-                         status=ctx.status.value)
-             return {
-                 "success": ctx.status == WorkflowStatus.COMPLETED,
-                 "execution_id": execution_id,
-                 "status": ctx.status.value,
-                 "recovered": False,
-             }
-
-         logger.info("Recovering execution",
-                     execution_id=execution_id,
-                     checkpoints=ctx.checkpoints)
-
-         # Reset any RUNNING nodes to PENDING (they were interrupted)
-         for node_exec in ctx.node_executions.values():
-             if node_exec.status == TaskStatus.RUNNING:
-                 node_exec.status = TaskStatus.PENDING
-                 node_exec.started_at = None
-
-         # Track in memory
-         self._active_contexts[ctx.execution_id] = ctx
-
-         # Resume decide loop
-         try:
-             await self._workflow_decide(ctx, enable_caching=True)
-
-             if ctx.all_nodes_complete():
-                 ctx.status = WorkflowStatus.COMPLETED
-             elif ctx.errors:
-                 ctx.status = WorkflowStatus.FAILED
-
-             ctx.completed_at = time.time()
-             await self.cache.save_execution_state(ctx)
-
-             return {
-                 "success": ctx.status == WorkflowStatus.COMPLETED,
-                 "execution_id": ctx.execution_id,
-                 "status": ctx.status.value,
-                 "recovered": True,
-                 "outputs": ctx.outputs,
-             }
-
-         finally:
-             self._active_contexts.pop(ctx.execution_id, None)
-
-     async def get_active_executions(self) -> List[str]:
-         """Get list of active execution IDs.
-
-         Returns:
-             List of execution IDs currently running
-         """
-         return list(self._active_contexts.keys())
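A sketch of how `recover_execution()` might be driven from a startup hook after a crash. The surrounding store and its methods are placeholders for illustration only; only the `WorkflowExecutor` API shown in this diff is assumed:

```python
# Editorial sketch: resuming interrupted runs on restart (store API is hypothetical).
async def resume_interrupted(executor: "WorkflowExecutor", store) -> None:
    for execution_id in await store.list_running_executions():   # hypothetical
        nodes, edges = await store.load_workflow_graph(execution_id)  # hypothetical
        result = await executor.recover_execution(execution_id, nodes, edges)
        if result is None:
            continue  # nothing persisted for this id
        print(execution_id, result["status"], "recovered:", result.get("recovered"))
```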
-
-     # =========================================================================
-     # DLQ REPLAY
-     # =========================================================================
-
-     async def replay_dlq_entry(self, entry_id: str,
-                                nodes: List[Dict],
-                                edges: List[Dict]) -> Dict[str, Any]:
-         """Replay a failed node from the Dead Letter Queue.
-
-         Creates a new execution context and attempts to re-execute the failed node.
-
-         Args:
-             entry_id: DLQ entry ID to replay
-             nodes: Workflow nodes
-             edges: Workflow edges
-
-         Returns:
-             Execution result dict
-         """
-         # Get DLQ entry
-         entry = await self.cache.get_dlq_entry(entry_id)
-         if not entry:
-             return {
-                 "success": False,
-                 "error": f"DLQ entry not found: {entry_id}",
-             }
-
-         logger.info("Replaying DLQ entry",
-                     entry_id=entry_id,
-                     node_id=entry.node_id,
-                     node_type=entry.node_type,
-                     original_execution=entry.execution_id)
-
-         # Create new execution context for replay
-         ctx = ExecutionContext.create(
-             workflow_id=entry.workflow_id,
-             session_id="dlq_replay",
-             nodes=nodes,
-             edges=edges,
-         )
-
-         # Get the node execution
-         node_exec = ctx.node_executions.get(entry.node_id)
-         if not node_exec:
-             return {
-                 "success": False,
-                 "error": f"Node not found in workflow: {entry.node_id}",
-             }
-
-         # Set up context with stored inputs
-         ctx.outputs = entry.inputs  # Restore input state
-
-         ctx.status = WorkflowStatus.RUNNING
-         ctx.started_at = time.time()
-         self._active_contexts[ctx.execution_id] = ctx
-
-         try:
-             # Execute the single node with retry
-             await self._execute_single_node(ctx, node_exec, enable_caching=False)
-
-             if node_exec.status == TaskStatus.COMPLETED:
-                 # Success - remove from DLQ
-                 await self.cache.remove_from_dlq(entry_id)
-                 logger.info("DLQ replay succeeded",
-                             entry_id=entry_id,
-                             node_id=entry.node_id)
-
-                 return {
-                     "success": True,
-                     "execution_id": ctx.execution_id,
-                     "node_id": entry.node_id,
-                     "result": node_exec.output,
-                     "removed_from_dlq": True,
-                 }
-             else:
-                 # Still failing - update DLQ entry
-                 await self.cache.update_dlq_entry(
-                     entry_id,
-                     entry.retry_count + 1,
-                     node_exec.error or "Unknown error"
-                 )
-
-                 return {
-                     "success": False,
-                     "execution_id": ctx.execution_id,
-                     "node_id": entry.node_id,
-                     "error": node_exec.error,
-                     "retry_count": entry.retry_count + 1,
-                 }
-
-         finally:
-             self._active_contexts.pop(ctx.execution_id, None)
+ """Workflow executor with Conductor decide pattern and parallel execution.
+
+ Implements:
+ - Conductor-style workflow_decide() for orchestration
+ - Prefect-style task caching for idempotency
+ - Fork/Join parallel execution with asyncio.wait (FIRST_COMPLETED pattern)
+ - Dynamic workflow branching at runtime
+ - Proper handling of long-running trigger nodes in parallel batches
+ """
+
+ import asyncio
+ import time
+ from collections import defaultdict
+ from datetime import datetime
+ from typing import Dict, Any, List, Optional, Callable, Awaitable, Set
+
+ from core.logging import get_logger
+ from constants import WORKFLOW_TRIGGER_TYPES
+ from .models import (
+     ExecutionContext,
+     TaskStatus,
+     WorkflowStatus,
+     NodeExecution,
+     hash_inputs,
+     RetryPolicy,
+     get_retry_policy,
+ )
+ from .cache import ExecutionCache
+ from .conditions import evaluate_condition, decide_next_edges
+ from .dlq import create_dlq_handler, DLQHandlerProtocol, NullDLQHandler
+
+ logger = get_logger(__name__)
+
+
+ def is_trigger_node(node_type: str) -> bool:
+     """Check if a node type is a trigger node (workflow starting point).
+
+     Trigger nodes have no input handles and serve as entry points for workflows.
+     They are identified by WORKFLOW_TRIGGER_TYPES in constants.py.
+
+     Args:
+         node_type: The node type string
+
+     Returns:
+         True if the node is a trigger type
+     """
+     return node_type in WORKFLOW_TRIGGER_TYPES
+
+
+ class WorkflowExecutor:
+     """Executes workflows using Conductor decide pattern with parallel execution.
+
+     Features:
+     - Isolated ExecutionContext per workflow run
+     - Parallel execution of independent nodes (Fork/Join)
+     - Result caching for idempotency (Prefect pattern)
+     - Distributed locking to prevent race conditions
+     - Event history for debugging and recovery
+     """
+
+     def __init__(self, cache: ExecutionCache,
+                  node_executor: Callable[[str, str, Dict, Dict], Awaitable[Dict]],
+                  status_callback: Callable[[str, str, Dict], Awaitable[None]] = None,
+                  dlq_enabled: bool = False):
+         """Initialize executor.
+
+         Args:
+             cache: ExecutionCache for Redis persistence
+             node_executor: Async function to execute a single node
+                 Signature: async def execute(node_id, node_type, params, context) -> result
+             status_callback: Optional async callback for status updates
+                 Signature: async def callback(node_id, status, data)
+             dlq_enabled: Whether to add failed nodes to Dead Letter Queue
+         """
+         self.cache = cache
+         self.node_executor = node_executor
+         self.status_callback = status_callback
+
+         # Create DLQ handler (modular - uses Null Object pattern when disabled)
+         self.dlq = create_dlq_handler(cache, enabled=dlq_enabled)
+
+         # Active executions (in-memory for fast lookup)
+         self._active_contexts: Dict[str, ExecutionContext] = {}
+
+     # =========================================================================
+     # EXECUTION ENTRY POINTS
+     # =========================================================================
+
+     async def execute_workflow(self, workflow_id: str, nodes: List[Dict],
+                                edges: List[Dict], session_id: str = "default",
+                                enable_caching: bool = True) -> Dict[str, Any]:
+         """Execute a workflow with parallel node execution.
+
+         Args:
+             workflow_id: Workflow identifier
+             nodes: List of workflow nodes
+             edges: List of edges connecting nodes
+             session_id: Session identifier
+             enable_caching: Whether to use result caching
+
+         Returns:
+             Execution result dict
+         """
+         start_time = time.time()
+
+         # Create isolated execution context
+         ctx = ExecutionContext.create(
+             workflow_id=workflow_id,
+             session_id=session_id,
+             nodes=nodes,
+             edges=edges,
+         )
+
+         # Compute execution layers (for parallel batches)
+         ctx.execution_order = self._compute_execution_layers(nodes, edges)
+
+         logger.info("Starting workflow execution",
+                     execution_id=ctx.execution_id,
+                     workflow_id=workflow_id,
+                     node_count=len(nodes),
+                     layers=len(ctx.execution_order))
+
+         # Track in memory
+         self._active_contexts[ctx.execution_id] = ctx
+
+         # Persist initial state
+         ctx.status = WorkflowStatus.RUNNING
+         ctx.started_at = time.time()
+         await self.cache.save_execution_state(ctx)
+
+         # Add workflow_started event
+         await self.cache.add_event(ctx.execution_id, "workflow_started", {
+             "workflow_id": workflow_id,
+             "node_count": len(nodes),
+         })
+
+         try:
+             # Run the decide loop
+             await self._workflow_decide(ctx, enable_caching)
+
+             # Determine final status
+             if ctx.all_nodes_complete():
+                 ctx.status = WorkflowStatus.COMPLETED
+             elif ctx.errors:
+                 ctx.status = WorkflowStatus.FAILED
+
+             ctx.completed_at = time.time()
+             await self.cache.save_execution_state(ctx)
+
+             # Add workflow_completed event
+             await self.cache.add_event(ctx.execution_id, "workflow_completed", {
+                 "status": ctx.status.value,
+                 "completed_nodes": len(ctx.get_completed_nodes()),
+                 "execution_time": ctx.completed_at - ctx.started_at,
+             })
+
+             return {
+                 "success": ctx.status == WorkflowStatus.COMPLETED,
+                 "execution_id": ctx.execution_id,
+                 "status": ctx.status.value,
+                 "nodes_executed": ctx.get_completed_nodes(),
+                 "outputs": ctx.outputs,
+                 "errors": ctx.errors,
+                 "execution_time": time.time() - start_time,
+                 "timestamp": datetime.now().isoformat(),
+             }
+
+         except asyncio.CancelledError:
+             ctx.status = WorkflowStatus.CANCELLED
+             ctx.completed_at = time.time()
+             await self.cache.save_execution_state(ctx)
+             await self.cache.add_event(ctx.execution_id, "workflow_cancelled", {})
+             return {
+                 "success": False,
+                 "execution_id": ctx.execution_id,
+                 "status": "cancelled",
+                 "error": "Cancelled by user",
+                 "execution_time": time.time() - start_time,
+             }
+
+         except Exception as e:
+             logger.error("Workflow execution failed", execution_id=ctx.execution_id,
+                          error=str(e))
+             ctx.status = WorkflowStatus.FAILED
+             ctx.errors.append({"error": str(e), "timestamp": time.time()})
+             await self.cache.save_execution_state(ctx)
+             await self.cache.add_event(ctx.execution_id, "workflow_failed", {
+                 "error": str(e),
+             })
+             return {
+                 "success": False,
+                 "execution_id": ctx.execution_id,
+                 "status": "failed",
+                 "error": str(e),
+                 "execution_time": time.time() - start_time,
+             }
+
+         finally:
+             # Cleanup
+             self._active_contexts.pop(ctx.execution_id, None)
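A hedged usage sketch of the entry point above. The node types, parameters, and the toy node handler are placeholders, not part of the package; only the `WorkflowExecutor` constructor and `execute_workflow` signature shown in this diff are assumed:

```python
# Editorial sketch: wiring up the executor with a toy node handler.
import asyncio

async def run_node(node_id, node_type, params, context):
    # Placeholder handler: every node "succeeds" and echoes its parameters.
    return {"success": True, "result": {"node": node_id, "params": params}}

async def main(cache):  # cache: an ExecutionCache instance
    executor = WorkflowExecutor(cache, node_executor=run_node)
    result = await executor.execute_workflow(
        workflow_id="demo",
        nodes=[{"id": "t1", "type": "manualTrigger", "parameters": {}},      # placeholder type
               {"id": "n1", "type": "httpRequest", "parameters": {"url": "https://example.com"}}],
        edges=[{"source": "t1", "target": "n1"}],
    )
    print(result["status"], result["outputs"])
```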
+
+     async def cancel_execution(self, execution_id: str) -> bool:
+         """Cancel a running execution.
+
+         Args:
+             execution_id: Execution to cancel
+
+         Returns:
+             True if cancelled successfully
+         """
+         ctx = self._active_contexts.get(execution_id)
+         if ctx:
+             ctx.status = WorkflowStatus.CANCELLED
+             for node_exec in ctx.node_executions.values():
+                 if node_exec.status in (TaskStatus.PENDING, TaskStatus.SCHEDULED,
+                                         TaskStatus.RUNNING, TaskStatus.WAITING):
+                     node_exec.status = TaskStatus.CANCELLED
+             await self.cache.save_execution_state(ctx)
+             logger.info("Execution cancelled", execution_id=execution_id)
+             return True
+         return False
+
+     # =========================================================================
+     # CONDUCTOR DECIDE PATTERN
+     # =========================================================================
+
+     async def _workflow_decide(self, ctx: ExecutionContext,
+                                enable_caching: bool = True) -> None:
+         """Core orchestration loop - Conductor's decide pattern.
+
+         Evaluates current state, finds ready nodes, executes them in parallel,
+         then recurses until all nodes complete or error occurs.
+
+         Args:
+             ctx: ExecutionContext to process
+             enable_caching: Whether to use result caching
+         """
+         # Distributed lock prevents concurrent decides for same execution
+         try:
+             async with self.cache.distributed_lock(
+                 f"execution:{ctx.execution_id}:decide", timeout=60
+             ):
+                 await self._decide_iteration(ctx, enable_caching)
+         except TimeoutError:
+             logger.warning("Could not acquire decide lock",
+                            execution_id=ctx.execution_id)
+             # Retry after short delay
+             await asyncio.sleep(0.5)
+             await self._workflow_decide(ctx, enable_caching)
+
+     async def _decide_iteration(self, ctx: ExecutionContext,
+                                 enable_caching: bool) -> None:
+         """Continuous scheduling loop - Temporal/Conductor pattern.
+
+         When any node completes, immediately check for newly-ready dependents
+         and start them without waiting for entire layer to complete.
+
+         Example: Cron3 (5s) completes -> immediately start WS3,
+         even while Cron1 (20s) is still running.
+         """
+         # Check if cancelled
+         if ctx.status == WorkflowStatus.CANCELLED:
+             return
+
+         # Find initial ready nodes
+         ready_nodes = self._find_ready_nodes(ctx)
+
+         if not ready_nodes:
+             if ctx.all_nodes_complete():
+                 logger.info("All nodes complete", execution_id=ctx.execution_id)
+             else:
+                 pending = ctx.get_pending_nodes()
+                 if pending:
+                     logger.warning("Stuck: pending nodes with unsatisfied deps",
+                                    execution_id=ctx.execution_id,
+                                    pending=pending)
+             return
+
+         logger.info("Starting continuous execution",
+                     execution_id=ctx.execution_id,
+                     initial_batch=len(ready_nodes),
+                     nodes=[n.node_id for n in ready_nodes])
+
+         # Execute with continuous scheduling - new pattern
+         await self._execute_with_continuous_scheduling(ctx, ready_nodes, enable_caching)
+
+         # Save final state
+         await self.cache.save_execution_state(ctx)
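The continuous-scheduling loop referenced above is the standard `asyncio.wait(..., return_when=FIRST_COMPLETED)` pattern. A self-contained toy, assuming nothing from the package, shows how a dependent task can start as soon as its predecessor finishes while slower tasks keep running:

```python
# Editorial sketch: FIRST_COMPLETED scheduling reduced to a toy.
import asyncio

async def work(name: str, seconds: float) -> str:
    await asyncio.sleep(seconds)
    return name

async def demo() -> None:
    pending = {asyncio.create_task(work("cron3", 0.05)),
               asyncio.create_task(work("cron1", 0.2))}
    while pending:
        done, pending = await asyncio.wait(pending, return_when=asyncio.FIRST_COMPLETED)
        for task in done:
            finished = task.result()
            print("finished:", finished)
            if finished == "cron3":
                # A dependent of cron3 starts immediately, without waiting for cron1.
                pending.add(asyncio.create_task(work("ws3", 0.05)))

asyncio.run(demo())
```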
|
289
|
+
|
|
290
|
+
# =========================================================================
|
|
291
|
+
# CONTINUOUS SCHEDULING (Temporal/Conductor Pattern)
|
|
292
|
+
# =========================================================================
|
|
293
|
+
|
|
294
|
+
async def _execute_with_continuous_scheduling(
|
|
295
|
+
self,
|
|
296
|
+
ctx: ExecutionContext,
|
|
297
|
+
initial_nodes: List[NodeExecution],
|
|
298
|
+
enable_caching: bool
|
|
299
|
+
) -> None:
|
|
300
|
+
"""Execute workflow with continuous scheduling.
|
|
301
|
+
|
|
302
|
+
Modern pattern: When any node completes, immediately check for and start
|
|
303
|
+
newly-ready dependent nodes. This enables true parallel pipelines where
|
|
304
|
+
each path progresses independently.
|
|
305
|
+
|
|
306
|
+
Uses asyncio.wait(FIRST_COMPLETED) to process completions immediately.
|
|
307
|
+
|
|
308
|
+
Args:
|
|
309
|
+
ctx: ExecutionContext
|
|
310
|
+
initial_nodes: Initial batch of ready nodes
|
|
311
|
+
enable_caching: Whether to use result caching
|
|
312
|
+
"""
|
|
313
|
+
# Track all running tasks: task -> NodeExecution
|
|
314
|
+
task_to_node: Dict[asyncio.Task, NodeExecution] = {}
|
|
315
|
+
pending_tasks: Set[asyncio.Task] = set()
|
|
316
|
+
workflow_failed = False
|
|
317
|
+
|
|
318
|
+
def create_node_task(node: NodeExecution) -> asyncio.Task:
|
|
319
|
+
"""Create and track a task for node execution."""
|
|
320
|
+
node.status = TaskStatus.SCHEDULED
|
|
321
|
+
task = asyncio.create_task(
|
|
322
|
+
self._execute_node_with_retry(ctx, node, enable_caching),
|
|
323
|
+
name=f"node_{node.node_id}"
|
|
324
|
+
)
|
|
325
|
+
task_to_node[task] = node
|
|
326
|
+
pending_tasks.add(task)
|
|
327
|
+
return task
|
|
328
|
+
|
|
329
|
+
# Start initial nodes
|
|
330
|
+
for node in initial_nodes:
|
|
331
|
+
create_node_task(node)
|
|
332
|
+
await self._notify_status(node.node_id, "scheduled", {})
|
|
333
|
+
logger.info("Scheduled node", node_id=node.node_id)
|
|
334
|
+
|
|
335
|
+
# Process completions and schedule new nodes continuously
|
|
336
|
+
while pending_tasks and not workflow_failed:
|
|
337
|
+
if ctx.status == WorkflowStatus.CANCELLED:
|
|
338
|
+
# Cancel all pending tasks
|
|
339
|
+
for task in pending_tasks:
|
|
340
|
+
task.cancel()
|
|
341
|
+
break
|
|
342
|
+
|
|
343
|
+
# Wait for ANY task to complete
|
|
344
|
+
done, pending_tasks = await asyncio.wait(
|
|
345
|
+
pending_tasks,
|
|
346
|
+
return_when=asyncio.FIRST_COMPLETED
|
|
347
|
+
)
|
|
348
|
+
|
|
349
|
+
# Process each completed task
|
|
350
|
+
for task in done:
|
|
351
|
+
node = task_to_node[task]
|
|
352
|
+
newly_ready = []
|
|
353
|
+
|
|
354
|
+
try:
|
|
355
|
+
result = task.result()
|
|
356
|
+
|
|
357
|
+
if isinstance(result, Exception):
|
|
358
|
+
node.status = TaskStatus.FAILED
|
|
359
|
+
node.error = str(result)
|
|
360
|
+
node.completed_at = time.time()
|
|
361
|
+
ctx.errors.append({
|
|
362
|
+
"node_id": node.node_id,
|
|
363
|
+
"error": str(result),
|
|
364
|
+
"timestamp": time.time(),
|
|
365
|
+
})
|
|
366
|
+
await self._notify_status(node.node_id, "error", {"error": str(result)})
|
|
367
|
+
logger.error("Node failed", node_id=node.node_id, error=str(result))
|
|
368
|
+
workflow_failed = True
|
|
369
|
+
|
|
370
|
+
elif result.get("retries_exhausted"):
|
|
371
|
+
node.status = TaskStatus.FAILED
|
|
372
|
+
node.error = result.get("error", "Unknown error")
|
|
373
|
+
node.completed_at = time.time()
|
|
374
|
+
ctx.errors.append({
|
|
375
|
+
"node_id": node.node_id,
|
|
376
|
+
"error": node.error,
|
|
377
|
+
"retries_exhausted": True,
|
|
378
|
+
"timestamp": time.time(),
|
|
379
|
+
})
|
|
380
|
+
workflow_failed = True
|
|
381
|
+
|
|
382
|
+
elif not result.get("success"):
|
|
383
|
+
node.status = TaskStatus.FAILED
|
|
384
|
+
node.error = result.get("error", "Unknown error")
|
|
385
|
+
node.completed_at = time.time()
|
|
386
|
+
ctx.errors.append({
|
|
387
|
+
"node_id": node.node_id,
|
|
388
|
+
"error": node.error,
|
|
389
|
+
"timestamp": time.time(),
|
|
390
|
+
})
|
|
391
|
+
await self._notify_status(node.node_id, "error", {"error": node.error})
|
|
392
|
+
logger.error("Node failed", node_id=node.node_id, error=node.error)
|
|
393
|
+
workflow_failed = True
|
|
394
|
+
|
|
395
|
+
else:
|
|
396
|
+
# Success - checkpoint and find newly ready nodes
|
|
397
|
+
ctx.add_checkpoint(node.node_id)
|
|
398
|
+
logger.info("Node completed", node_id=node.node_id)
|
|
399
|
+
|
|
400
|
+
# Find nodes that are now ready (their dependencies just completed)
|
|
401
|
+
newly_ready = self._find_ready_nodes(ctx)
|
|
402
|
+
|
|
403
|
+
except asyncio.CancelledError:
|
|
404
|
+
node.status = TaskStatus.CANCELLED
|
|
405
|
+
node.completed_at = time.time()
|
|
406
|
+
logger.info("Node cancelled", node_id=node.node_id)
|
|
407
|
+
|
|
408
|
+
except Exception as e:
|
|
409
|
+
node.status = TaskStatus.FAILED
|
|
410
|
+
node.error = str(e)
|
|
411
|
+
node.completed_at = time.time()
|
|
412
|
+
ctx.errors.append({
|
|
413
|
+
"node_id": node.node_id,
|
|
414
|
+
"error": str(e),
|
|
415
|
+
"timestamp": time.time(),
|
|
416
|
+
})
|
|
417
|
+
await self._notify_status(node.node_id, "error", {"error": str(e)})
|
|
418
|
+
logger.error("Node exception", node_id=node.node_id, error=str(e))
|
|
419
|
+
workflow_failed = True
|
|
420
|
+
|
|
421
|
+
# Schedule newly ready nodes immediately
|
|
422
|
+
if newly_ready and not workflow_failed:
|
|
423
|
+
for ready_node in newly_ready:
|
|
424
|
+
create_node_task(ready_node)
|
|
425
|
+
await self._notify_status(ready_node.node_id, "scheduled", {})
|
|
426
|
+
logger.info("Scheduled dependent node",
|
|
427
|
+
node_id=ready_node.node_id,
|
|
428
|
+
triggered_by=node.node_id)
|
|
429
|
+
|
|
430
|
+
# Periodic state save
|
|
431
|
+
await self.cache.save_execution_state(ctx)
|
|
432
|
+
|
|
433
|
+
# Handle workflow failure - cancel remaining tasks
|
|
434
|
+
if workflow_failed and pending_tasks:
|
|
435
|
+
logger.info("Workflow failed, cancelling remaining tasks",
|
|
436
|
+
pending_count=len(pending_tasks))
|
|
437
|
+
|
|
438
|
+
for task in pending_tasks:
|
|
439
|
+
task.cancel()
|
|
440
|
+
|
|
441
|
+
# Wait for cancelled tasks
|
|
442
|
+
if pending_tasks:
|
|
443
|
+
cancelled_done, _ = await asyncio.wait(
|
|
444
|
+
pending_tasks,
|
|
445
|
+
return_when=asyncio.ALL_COMPLETED
|
|
446
|
+
)
|
|
447
|
+
|
|
448
|
+
for task in cancelled_done:
|
|
449
|
+
node = task_to_node.get(task)
|
|
450
|
+
if node and node.status not in (TaskStatus.COMPLETED, TaskStatus.FAILED):
|
|
451
|
+
node.status = TaskStatus.CANCELLED
|
|
452
|
+
node.completed_at = time.time()
|
|
453
|
+
|
|
454
|
+
ctx.status = WorkflowStatus.FAILED
|
|
455
|
+
|
|
456
|
+
# =========================================================================
|
|
457
|
+
# PARALLEL EXECUTION (Legacy - Fork/Join with FIRST_COMPLETED pattern)
|
|
458
|
+
# =========================================================================
|
|
459
|
+
|
|
460
|
+
async def _execute_parallel_nodes(self, ctx: ExecutionContext,
|
|
461
|
+
nodes: List[NodeExecution],
|
|
462
|
+
enable_caching: bool) -> None:
|
|
463
|
+
"""Execute multiple nodes in parallel using asyncio.wait with FIRST_COMPLETED.
|
|
464
|
+
|
|
465
|
+
Uses the standard asyncio pattern for mixed task types:
|
|
466
|
+
- Regular nodes complete quickly
|
|
467
|
+
- Trigger nodes wait indefinitely for external events
|
|
468
|
+
- If a regular node fails, cancel remaining trigger nodes immediately
|
|
469
|
+
|
|
470
|
+
This follows Python asyncio best practices:
|
|
471
|
+
https://docs.python.org/3/library/asyncio-task.html#asyncio.wait
|
|
472
|
+
|
|
473
|
+
Args:
|
|
474
|
+
ctx: ExecutionContext
|
|
475
|
+
nodes: List of NodeExecution to run in parallel
|
|
476
|
+
enable_caching: Whether to use result caching
|
|
477
|
+
"""
|
|
478
|
+
# Mark all as scheduled
|
|
479
|
+
for node in nodes:
|
|
480
|
+
node.status = TaskStatus.SCHEDULED
|
|
481
|
+
await self._notify_status(node.node_id, "scheduled", {})
|
|
482
|
+
|
|
483
|
+
# Create named tasks for parallel execution
|
|
484
|
+
# Using dict to track node -> task mapping for proper result handling
|
|
485
|
+
node_to_task: Dict[str, asyncio.Task] = {}
|
|
486
|
+
task_to_node: Dict[asyncio.Task, NodeExecution] = {}
|
|
487
|
+
|
|
488
|
+
for node in nodes:
|
|
489
|
+
task = asyncio.create_task(
|
|
490
|
+
self._execute_node_with_retry(ctx, node, enable_caching),
|
|
491
|
+
name=f"node_{node.node_id}"
|
|
492
|
+
)
|
|
493
|
+
node_to_task[node.node_id] = task
|
|
494
|
+
task_to_node[task] = node
|
|
495
|
+
|
|
496
|
+
pending: Set[asyncio.Task] = set(node_to_task.values())
|
|
497
|
+
workflow_failed = False
|
|
498
|
+
|
|
499
|
+
# Process tasks as they complete using FIRST_COMPLETED pattern
|
|
500
|
+
while pending:
|
|
501
|
+
# Wait for any task to complete
|
|
502
|
+
done, pending = await asyncio.wait(
|
|
503
|
+
pending,
|
|
504
|
+
return_when=asyncio.FIRST_COMPLETED
|
|
505
|
+
)
|
|
506
|
+
|
|
507
|
+
# Process completed tasks
|
|
508
|
+
for task in done:
|
|
509
|
+
node = task_to_node[task]
|
|
510
|
+
|
|
511
|
+
try:
|
|
512
|
+
result = task.result()
|
|
513
|
+
|
|
514
|
+
if isinstance(result, Exception):
|
|
515
|
+
# Task raised exception
|
|
516
|
+
node.status = TaskStatus.FAILED
|
|
517
|
+
node.error = str(result)
|
|
518
|
+
node.completed_at = time.time()
|
|
519
|
+
ctx.errors.append({
|
|
520
|
+
"node_id": node.node_id,
|
|
521
|
+
"error": str(result),
|
|
522
|
+
"timestamp": time.time(),
|
|
523
|
+
})
|
|
524
|
+
await self._notify_status(node.node_id, "error", {"error": str(result)})
|
|
525
|
+
logger.error("Parallel node failed",
|
|
526
|
+
node_id=node.node_id, error=str(result))
|
|
527
|
+
workflow_failed = True
|
|
528
|
+
|
|
529
|
+
elif result.get("retries_exhausted"):
|
|
530
|
+
# Node failed after all retries - already in DLQ
|
|
531
|
+
node.status = TaskStatus.FAILED
|
|
532
|
+
node.error = result.get("error", "Unknown error")
|
|
533
|
+
node.completed_at = time.time()
|
|
534
|
+
ctx.errors.append({
|
|
535
|
+
"node_id": node.node_id,
|
|
536
|
+
"error": node.error,
|
|
537
|
+
"retries_exhausted": True,
|
|
538
|
+
"timestamp": time.time(),
|
|
539
|
+
})
|
|
540
|
+
workflow_failed = True
|
|
541
|
+
|
|
542
|
+
elif not result.get("success"):
|
|
543
|
+
# Node returned failure without exhausting retries
|
|
544
|
+
node.status = TaskStatus.FAILED
|
|
545
|
+
node.error = result.get("error", "Unknown error")
|
|
546
|
+
node.completed_at = time.time()
|
|
547
|
+
ctx.errors.append({
|
|
548
|
+
"node_id": node.node_id,
|
|
549
|
+
"error": node.error,
|
|
550
|
+
"timestamp": time.time(),
|
|
551
|
+
})
|
|
552
|
+
await self._notify_status(node.node_id, "error", {"error": node.error})
|
|
553
|
+
logger.error("Parallel node failed",
|
|
554
|
+
node_id=node.node_id, error=node.error)
|
|
555
|
+
workflow_failed = True
|
|
556
|
+
|
|
557
|
+
except asyncio.CancelledError:
|
|
558
|
+
# Task was cancelled (by us or externally)
|
|
559
|
+
node.status = TaskStatus.CANCELLED
|
|
560
|
+
node.completed_at = time.time()
|
|
561
|
+
logger.info("Parallel node cancelled", node_id=node.node_id)
|
|
562
|
+
|
|
563
|
+
except Exception as e:
|
|
564
|
+
# Unexpected exception from task.result()
|
|
565
|
+
node.status = TaskStatus.FAILED
|
|
566
|
+
node.error = str(e)
|
|
567
|
+
node.completed_at = time.time()
|
|
568
|
+
ctx.errors.append({
|
|
569
|
+
"node_id": node.node_id,
|
|
570
|
+
"error": str(e),
|
|
571
|
+
"timestamp": time.time(),
|
|
572
|
+
})
|
|
573
|
+
await self._notify_status(node.node_id, "error", {"error": str(e)})
|
|
574
|
+
logger.error("Parallel node exception",
|
|
575
|
+
node_id=node.node_id, error=str(e))
|
|
576
|
+
workflow_failed = True
|
|
577
|
+
|
|
578
|
+
# If workflow failed, cancel remaining pending tasks
|
|
579
|
+
# This prevents trigger nodes from blocking forever when a regular node fails
|
|
580
|
+
if workflow_failed and pending:
|
|
581
|
+
logger.info("Workflow failed, cancelling remaining tasks",
|
|
582
|
+
pending_count=len(pending))
|
|
583
|
+
|
|
584
|
+
for task in pending:
|
|
585
|
+
task.cancel()
|
|
586
|
+
|
|
587
|
+
# Wait for cancelled tasks to finish
|
|
588
|
+
if pending:
|
|
589
|
+
cancelled_done, _ = await asyncio.wait(
|
|
590
|
+
pending,
|
|
591
|
+
return_when=asyncio.ALL_COMPLETED
|
|
592
|
+
)
|
|
593
|
+
|
|
594
|
+
# Mark cancelled nodes
|
|
595
|
+
for task in cancelled_done:
|
|
596
|
+
node = task_to_node[task]
|
|
597
|
+
if node.status not in (TaskStatus.COMPLETED, TaskStatus.FAILED):
|
|
598
|
+
node.status = TaskStatus.CANCELLED
|
|
599
|
+
node.completed_at = time.time()
|
|
600
|
+
logger.info("Cancelled pending node", node_id=node.node_id)
|
|
601
|
+
|
|
602
|
+
pending = set() # All done now
|
|
603
|
+
|
|
604
|
+
# Mark workflow as failed if any node failed
|
|
605
|
+
if workflow_failed:
|
|
606
|
+
ctx.status = WorkflowStatus.FAILED
|
|
607
|
+
|
|
608
|
+
async def _execute_single_node(self, ctx: ExecutionContext,
|
|
609
|
+
node: NodeExecution,
|
|
610
|
+
enable_caching: bool) -> None:
|
|
611
|
+
"""Execute a single node with retry logic.
|
|
612
|
+
|
|
613
|
+
Args:
|
|
614
|
+
ctx: ExecutionContext
|
|
615
|
+
node: NodeExecution to run
|
|
616
|
+
enable_caching: Whether to use result caching
|
|
617
|
+
"""
|
|
618
|
+
node.status = TaskStatus.SCHEDULED
|
|
619
|
+
await self._notify_status(node.node_id, "scheduled", {})
|
|
620
|
+
|
|
621
|
+
try:
|
|
622
|
+
result = await self._execute_node_with_retry(ctx, node, enable_caching)
|
|
623
|
+
|
|
624
|
+
if result.get("retries_exhausted"):
|
|
625
|
+
# Node failed after all retries - already in DLQ
|
|
626
|
+
node.status = TaskStatus.FAILED
|
|
627
|
+
node.error = result.get("error", "Unknown error")
|
|
628
|
+
node.completed_at = time.time()
|
|
629
|
+
ctx.errors.append({
|
|
630
|
+
"node_id": node.node_id,
|
|
631
|
+
"error": node.error,
|
|
632
|
+
"retries_exhausted": True,
|
|
633
|
+
"timestamp": time.time(),
|
|
634
|
+
})
|
|
635
|
+
ctx.status = WorkflowStatus.FAILED
|
|
636
|
+
|
|
637
|
+
except Exception as e:
|
|
638
|
+
node.status = TaskStatus.FAILED
|
|
639
|
+
node.error = str(e)
|
|
640
|
+
node.completed_at = time.time()
|
|
641
|
+
ctx.errors.append({
|
|
642
|
+
"node_id": node.node_id,
|
|
643
|
+
"error": str(e),
|
|
644
|
+
"timestamp": time.time(),
|
|
645
|
+
})
|
|
646
|
+
await self._notify_status(node.node_id, "error", {"error": str(e)})
|
|
647
|
+
ctx.status = WorkflowStatus.FAILED
|
|
648
|
+
|
|
649
|
+
# =========================================================================
|
|
650
|
+
# RETRY LOGIC
|
|
651
|
+
# =========================================================================
|
|
652
|
+
|
|
653
|
+
async def _execute_node_with_retry(self, ctx: ExecutionContext,
|
|
654
|
+
node: NodeExecution,
|
|
655
|
+
enable_caching: bool) -> Dict[str, Any]:
|
|
656
|
+
"""Execute node with retry logic and DLQ on final failure.
|
|
657
|
+
|
|
658
|
+
Uses exponential backoff retry policy based on node type.
|
|
659
|
+
On exhausted retries, adds entry to Dead Letter Queue.
|
|
660
|
+
|
|
661
|
+
Args:
|
|
662
|
+
ctx: ExecutionContext
|
|
663
|
+
node: NodeExecution to run
|
|
664
|
+
enable_caching: Whether to use result caching
|
|
665
|
+
|
|
666
|
+
Returns:
|
|
667
|
+
Execution result
|
|
668
|
+
"""
|
|
669
|
+
# Get retry policy for this node type
|
|
670
|
+
node_data = self._get_node_data(ctx, node.node_id)
|
|
671
|
+
custom_policy = node_data.get("parameters", {}).get("retryPolicy")
|
|
672
|
+
retry_policy = get_retry_policy(node.node_type, custom_policy)
|
|
673
|
+
|
|
674
|
+
last_error = None
|
|
675
|
+
inputs = self._gather_node_inputs(ctx, node.node_id)
|
|
676
|
+
|
|
677
|
+
for attempt in range(retry_policy.max_attempts):
|
|
678
|
+
try:
|
|
679
|
+
node.retry_count = attempt
|
|
680
|
+
result = await self._execute_node_with_caching(ctx, node, enable_caching)
|
|
681
|
+
|
|
682
|
+
# Success - return result
|
|
683
|
+
if result.get("success"):
|
|
684
|
+
return result
|
|
685
|
+
|
|
686
|
+
# Execution returned failure (not exception)
|
|
687
|
+
error = result.get("error", "Unknown error")
|
|
688
|
+
last_error = error
|
|
689
|
+
|
|
690
|
+
# Check if we should retry
|
|
691
|
+
if retry_policy.should_retry(error, attempt + 1):
|
|
692
|
+
delay = retry_policy.calculate_delay(attempt)
|
|
693
|
+
logger.info("Retrying node after failure",
|
|
694
|
+
node_id=node.node_id,
|
|
695
|
+
attempt=attempt + 1,
|
|
696
|
+
max_attempts=retry_policy.max_attempts,
|
|
697
|
+
delay=delay,
|
|
698
|
+
error=error[:100])
|
|
699
|
+
|
|
700
|
+
await self._notify_status(node.node_id, "retrying", {
|
|
701
|
+
"attempt": attempt + 1,
|
|
702
|
+
"max_attempts": retry_policy.max_attempts,
|
|
703
|
+
"delay": delay,
|
|
704
|
+
"error": error,
|
|
705
|
+
})
|
|
706
|
+
|
|
707
|
+
await asyncio.sleep(delay)
|
|
708
|
+
|
|
709
|
+
# Reset node status for retry
|
|
710
|
+
node.status = TaskStatus.PENDING
|
|
711
|
+
node.error = None
|
|
712
|
+
continue
|
|
713
|
+
else:
|
|
714
|
+
# Not retryable, break out
|
|
715
|
+
break
|
|
716
|
+
|
|
717
|
+
except asyncio.CancelledError:
|
|
718
|
+
raise # Propagate cancellation
|
|
719
|
+
except Exception as e:
|
|
720
|
+
last_error = str(e)
|
|
721
|
+
logger.warning("Node execution exception",
|
|
722
|
+
node_id=node.node_id,
|
|
723
|
+
attempt=attempt + 1,
|
|
724
|
+
error=last_error)
|
|
725
|
+
|
|
726
|
+
# Check if we should retry
|
|
727
|
+
if retry_policy.should_retry(last_error, attempt + 1):
|
|
728
|
+
delay = retry_policy.calculate_delay(attempt)
|
|
729
|
+
logger.info("Retrying node after exception",
|
|
730
|
+
node_id=node.node_id,
|
|
731
|
+
attempt=attempt + 1,
|
|
732
|
+
delay=delay)
|
|
733
|
+
|
|
734
|
+
await asyncio.sleep(delay)
|
|
735
|
+
node.status = TaskStatus.PENDING
|
|
736
|
+
node.error = None
|
|
737
|
+
continue
|
|
738
|
+
else:
|
|
739
|
+
break
|
|
740
|
+
|
|
741
|
+
# All retries exhausted - add to DLQ (handler is no-op if disabled)
|
|
742
|
+
await self.dlq.add_failed_node(ctx, node, inputs, last_error or "Unknown error")
|
|
743
|
+
|
|
744
|
+
# Return failure result
|
|
745
|
+
return {
|
|
746
|
+
"success": False,
|
|
747
|
+
"error": last_error or "Unknown error",
|
|
748
|
+
"retries_exhausted": True,
|
|
749
|
+
"retry_count": node.retry_count,
|
|
750
|
+
}
|
|
751
|
+
|
|
752
|
+
# =========================================================================
|
|
753
|
+
# CACHED NODE EXECUTION (Prefect pattern)
|
|
754
|
+
# =========================================================================
|
|
755
|
+
|
|
756
|
+
async def _execute_node_with_caching(self, ctx: ExecutionContext,
|
|
757
|
+
node: NodeExecution,
|
|
758
|
+
enable_caching: bool) -> Dict[str, Any]:
|
|
759
|
+
"""Execute node with result caching (Prefect pattern).
|
|
760
|
+
|
|
761
|
+
Args:
|
|
762
|
+
ctx: ExecutionContext
|
|
763
|
+
node: NodeExecution to run
|
|
764
|
+
enable_caching: Whether to check cache
|
|
765
|
+
|
|
766
|
+
Returns:
|
|
767
|
+
Execution result
|
|
768
|
+
"""
|
|
769
|
+
# Get node parameters and inputs
|
|
770
|
+
node_data = self._get_node_data(ctx, node.node_id)
|
|
771
|
+
inputs = self._gather_node_inputs(ctx, node.node_id)
|
|
772
|
+
|
|
773
|
+
# Check cache first (Prefect pattern)
|
|
774
|
+
if enable_caching:
|
|
775
|
+
cached_result = await self.cache.get_cached_result(
|
|
776
|
+
ctx.execution_id, node.node_id, inputs
|
|
777
|
+
)
|
|
778
|
+
if cached_result:
|
|
779
|
+
logger.info("Cache hit", node_id=node.node_id)
|
|
780
|
+
node.status = TaskStatus.CACHED
|
|
781
|
+
node.output = cached_result
|
|
782
|
+
node.input_hash = hash_inputs(inputs)
|
|
783
|
+
node.completed_at = time.time()
|
|
784
|
+
ctx.outputs[node.node_id] = cached_result
|
|
785
|
+
await self._notify_status(node.node_id, "success",
|
|
786
|
+
{"cached": True, **cached_result})
|
|
787
|
+
await self.cache.add_event(ctx.execution_id, "node_cached", {
|
|
788
|
+
"node_id": node.node_id,
|
|
789
|
+
})
|
|
790
|
+
return cached_result
|
|
791
|
+
|
|
+        # Execute node
+        node.status = TaskStatus.RUNNING
+        node.started_at = time.time()
+        node.input_hash = hash_inputs(inputs)
+        await self._notify_status(node.node_id, "executing", {})
+        await self.cache.add_event(ctx.execution_id, "node_started", {
+            "node_id": node.node_id,
+            "node_type": node.node_type,
+        })
+
+        # Update heartbeat (for crash detection)
+        await self.cache.update_heartbeat(ctx.execution_id, node.node_id)
+
+        # Build execution context for node handler
+        # workflow_id is included for per-workflow status scoping (n8n pattern)
+        exec_context = {
+            "nodes": ctx.nodes,
+            "edges": ctx.edges,
+            "session_id": ctx.session_id,
+            "execution_id": ctx.execution_id,
+            "workflow_id": ctx.workflow_id,  # For per-workflow status broadcasts
+            "start_time": node.started_at,
+            "outputs": ctx.outputs,  # Previous node outputs
+        }
+
+        # Call the actual node executor
+        result = await self.node_executor(
+            node.node_id,
+            node.node_type,
+            node_data.get("parameters", {}),
+            exec_context
+        )
+
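self.node_executor is an injected async callable; the call site above fixes its signature as (node_id, node_type, parameters, exec_context) returning a dict with "success" and either "result" or "error". A minimal executor satisfying that contract might look like the sketch below (the "echo" node type is made up for illustration).

    # Hypothetical node executor matching the call signature used above.
    from typing import Any, Dict

    async def node_executor(node_id: str, node_type: str,
                            parameters: Dict[str, Any],
                            exec_context: Dict[str, Any]) -> Dict[str, Any]:
        try:
            if node_type == "echo":  # made-up node type for illustration
                return {"success": True, "result": {"echo": parameters.get("message", "")}}
            return {"success": False, "error": f"Unknown node type: {node_type}"}
        except Exception as e:
            return {"success": False, "error": str(e)}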
+        # Process result
+        if result.get("success"):
+            node.status = TaskStatus.COMPLETED
+            node.output = result.get("result", {})
+            node.completed_at = time.time()
+            ctx.outputs[node.node_id] = node.output
+
+            # Cache result (Prefect pattern)
+            if enable_caching:
+                await self.cache.set_cached_result(
+                    ctx.execution_id, node.node_id, inputs, node.output
+                )
+
+            await self._notify_status(node.node_id, "success", node.output)
+            await self.cache.add_event(ctx.execution_id, "node_completed", {
+                "node_id": node.node_id,
+                "execution_time": node.completed_at - node.started_at,
+            })
+        else:
+            node.status = TaskStatus.FAILED
+            node.error = result.get("error", "Unknown error")
+            node.completed_at = time.time()
+            ctx.errors.append({
+                "node_id": node.node_id,
+                "error": node.error,
+                "timestamp": time.time(),
+            })
+
+            await self._notify_status(node.node_id, "error", {"error": node.error})
+            await self.cache.add_event(ctx.execution_id, "node_failed", {
+                "node_id": node.node_id,
+                "error": node.error,
+            })
+
+            # Mark workflow as failed
+            ctx.status = WorkflowStatus.FAILED
+
+        return result
+
+    # =========================================================================
+    # DAG ANALYSIS
+    # =========================================================================
+
+    def _compute_execution_layers(self, nodes: List[Dict],
+                                  edges: List[Dict]) -> List[List[str]]:
+        """Compute execution layers for parallel execution.
+
+        Nodes in the same layer have no dependencies on each other
+        and can execute in parallel. Layer 0 contains trigger nodes
+        (workflow starting points with no input handles).
+
+        Following n8n pattern: Trigger nodes are the starting point of every
+        workflow. They listen for specific events/conditions and initiate
+        the execution of the entire workflow.
+
+        Config nodes and toolkit sub-nodes are excluded from layers since
+        they don't execute as independent workflow nodes.
+
+        Args:
+            nodes: List of workflow nodes
+            edges: List of edges
+
+        Returns:
+            List of layers, where each layer is a list of node IDs
+        """
+        from constants import CONFIG_NODE_TYPES, TOOLKIT_NODE_TYPES
+
+        # Build node type lookup for trigger detection
+        node_types: Dict[str, str] = {
+            node["id"]: node.get("type", "unknown") for node in nodes
+        }
+
+        # Find toolkit sub-nodes (nodes that connect TO a toolkit)
+        toolkit_node_ids = {n.get("id") for n in nodes if n.get("type") in TOOLKIT_NODE_TYPES}
+
+        # Find AI Agent nodes (both aiAgent and chatAgent have config handles)
+        ai_agent_node_ids = {n.get("id") for n in nodes if n.get("type") in ('aiAgent', 'chatAgent')}
+
+        subnode_ids: set = set()
+        for edge in edges:
+            source = edge.get("source")
+            target = edge.get("target")
+            target_handle = edge.get("targetHandle")
+
+            # Any node that connects TO a toolkit is a sub-node
+            if target in toolkit_node_ids and source:
+                subnode_ids.add(source)
+
+            # Nodes connected to AI Agent/Zeenie config handles are sub-nodes
+            # These handles: input-memory, input-tools, input-skill
+            if target in ai_agent_node_ids and source and target_handle:
+                if target_handle in ('input-memory', 'input-tools', 'input-skill'):
+                    subnode_ids.add(source)
+
+        # Filter out config nodes and sub-nodes from execution
+        excluded_ids = set()
+        for node in nodes:
+            node_id = node.get("id")
+            node_type = node.get("type", "unknown")
+            if node_type in CONFIG_NODE_TYPES or node_id in subnode_ids:
+                excluded_ids.add(node_id)
+
+        # Build adjacency and in-degree maps (excluding filtered nodes)
+        in_degree: Dict[str, int] = defaultdict(int)
+        adjacency: Dict[str, List[str]] = defaultdict(list)
+        node_ids = {node["id"] for node in nodes if node["id"] not in excluded_ids}
+
+        for edge in edges:
+            source = edge.get("source")
+            target = edge.get("target")
+            if source in node_ids and target in node_ids:
+                adjacency[source].append(target)
+                in_degree[target] += 1
+
+        # Initialize in-degree for all nodes
+        for node_id in node_ids:
+            if node_id not in in_degree:
+                in_degree[node_id] = 0
+
+        # Kahn's algorithm for topological sort with layers
+        layers = []
+        remaining = set(node_ids)
+        is_first_layer = True
+
+        while remaining:
+            # Find all nodes with in-degree 0 (no dependencies)
+            layer = [
+                node_id for node_id in remaining
+                if in_degree[node_id] == 0
+            ]
+
+            if not layer:
+                # Cycle detected or stuck
+                logger.warning("Cycle detected or no start nodes",
+                               remaining=list(remaining))
+                # Add remaining as single layer to avoid infinite loop
+                layers.append(list(remaining))
+                break
+
+            # For layer 0, validate that starting nodes are trigger nodes
+            if is_first_layer:
+                trigger_nodes = []
+                non_trigger_nodes = []
+
+                for node_id in layer:
+                    node_type = node_types.get(node_id, "unknown")
+                    if is_trigger_node(node_type):
+                        trigger_nodes.append(node_id)
+                    else:
+                        non_trigger_nodes.append(node_id)
+                        logger.warning(
+                            "Non-trigger node found at graph entry point",
+                            node_id=node_id,
+                            node_type=node_type,
+                            expected_types=list(WORKFLOW_TRIGGER_TYPES)
+                        )
+
+                # Log trigger node identification
+                if trigger_nodes:
+                    logger.info(
+                        "Identified trigger nodes as workflow starting points",
+                        trigger_count=len(trigger_nodes),
+                        trigger_nodes=[
+                            f"{nid[:8]}({node_types.get(nid)})"
+                            for nid in trigger_nodes
+                        ]
+                    )
+
+                is_first_layer = False
+
+            layers.append(layer)
+
+            # Remove layer nodes and update in-degrees
+            for node_id in layer:
+                remaining.remove(node_id)
+                for successor in adjacency[node_id]:
+                    in_degree[successor] -= 1
+
+        logger.debug("Computed execution layers",
+                     layer_count=len(layers),
+                     layers=[[n[:8] for n in l] for l in layers])
+
+        return layers
+
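The layering loop is Kahn's algorithm grouped into waves: every node whose in-degree is currently zero joins the next layer, then its outgoing edges are discounted. A self-contained toy run, stripped of the config-node and trigger filtering above, shows the resulting shape:

    # Standalone illustration of layered topological sort (same idea as above).
    from collections import defaultdict
    from typing import Dict, List

    def layers_of(edges: List[tuple], nodes: List[str]) -> List[List[str]]:
        in_degree = {n: 0 for n in nodes}
        adjacency: Dict[str, List[str]] = defaultdict(list)
        for src, dst in edges:
            adjacency[src].append(dst)
            in_degree[dst] += 1

        layers, remaining = [], set(nodes)
        while remaining:
            layer = sorted(n for n in remaining if in_degree[n] == 0)
            if not layer:  # cycle: bail out, as the orchestrator does
                layers.append(sorted(remaining))
                break
            layers.append(layer)
            for n in layer:
                remaining.remove(n)
                for succ in adjacency[n]:
                    in_degree[succ] -= 1
        return layers

    # trigger -> (a, b) -> join: a and b share a layer and can run in parallel
    print(layers_of([("trigger", "a"), ("trigger", "b"), ("a", "join"), ("b", "join")],
                    ["trigger", "a", "b", "join"]))
    # [['trigger'], ['a', 'b'], ['join']]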
+    def _find_ready_nodes(self, ctx: ExecutionContext) -> List[NodeExecution]:
+        """Find nodes ready to execute (dependencies satisfied + conditions met).
+
+        A node is ready if:
+        - Status is PENDING
+        - Not disabled (n8n-style disable feature)
+        - All upstream nodes are COMPLETED, CACHED, or SKIPPED
+        - Edge conditions (if any) evaluate to True based on upstream outputs
+
+        Supports runtime conditional branching (Prefect-style dynamic workflows).
+
+        Args:
+            ctx: ExecutionContext
+
+        Returns:
+            List of NodeExecution ready to run
+        """
+        from constants import CONFIG_NODE_TYPES
+
+        # Build set of completed nodes
+        completed = set(ctx.get_completed_nodes())
+
+        # Build map of node_id -> node_type for config node detection
+        node_types: Dict[str, str] = {}
+        for node in ctx.nodes:
+            node_types[node.get("id", "")] = node.get("type", "unknown")
+
+        # Build dependency map and track conditional edges
+        # Skip edges from config nodes (they don't execute, provide config only)
+        dependencies: Dict[str, Set[str]] = defaultdict(set)
+        conditional_edges: Dict[str, List[Dict]] = defaultdict(list)  # target -> edges with conditions
+
+        for edge in ctx.edges:
+            target = edge.get("target")
+            source = edge.get("source")
+            if target and source:
+                # Skip edges from config nodes - they provide configuration, not execution dependencies
+                source_type = node_types.get(source, "unknown")
+                if source_type in CONFIG_NODE_TYPES:
+                    continue
+
+                dependencies[target].add(source)
+                # Track edges with conditions for evaluation
+                if edge.get("data", {}).get("condition"):
+                    conditional_edges[target].append(edge)
+
+        # Find ready nodes
+        ready = []
+        for node_id, node_exec in ctx.node_executions.items():
+            if node_exec.status != TaskStatus.PENDING:
+                continue
+
+            # Check if all dependencies are satisfied
+            deps = dependencies.get(node_id, set())
+            if not deps <= completed:  # Not all deps completed
+                continue
+
+            # Check if node is disabled (n8n-style disable)
+            node_data = self._get_node_data(ctx, node_id)
+            if node_data.get("data", {}).get("disabled"):
+                node_exec.status = TaskStatus.SKIPPED
+                node_exec.completed_at = time.time()
+                logger.debug("Skipping disabled node", node_id=node_id)
+                # Notify status callback about skipped node
+                asyncio.create_task(self._notify_status(node_id, "skipped", {"disabled": True}))
+                continue
+
+            # Check conditional edges for this node
+            if node_id in conditional_edges:
+                # Has conditional incoming edges - evaluate them
+                conditions_met = self._evaluate_incoming_conditions(
+                    ctx, node_id, conditional_edges[node_id]
+                )
+                if not conditions_met:
+                    # Mark as SKIPPED if conditions not met and all deps done
+                    node_exec.status = TaskStatus.SKIPPED
+                    logger.info("Node skipped due to unmet conditions",
+                                node_id=node_id)
+                    continue
+
+            ready.append(node_exec)
+
+        return ready
+
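The readiness check reads a few optional fields off the workflow's node and edge dicts: "disabled" under a node's data, "condition" under an edge's data, plus the id/type/source/target/targetHandle fields used elsewhere in this module. The values below are made up purely to illustrate those shapes; only the field names are taken from the code above.

    # Illustrative payloads consumed by _find_ready_nodes (values are invented).
    example_node = {
        "id": "node-2",
        "type": "httpRequest",              # hypothetical node type
        "data": {"disabled": False},        # disabled nodes are marked SKIPPED
        "parameters": {"url": "https://example.com"},
    }

    example_edge = {
        "source": "node-1",
        "target": "node-2",
        "targetHandle": "input-main",       # hypothetical handle name
        "data": {"condition": {"field": "status", "operator": "equals", "value": "ok"}},
    }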
+    def _evaluate_incoming_conditions(self, ctx: ExecutionContext, target_node_id: str,
+                                      edges: List[Dict]) -> bool:
+        """Evaluate conditions on incoming edges to determine if node should run.
+
+        Args:
+            ctx: ExecutionContext
+            target_node_id: The node we're checking
+            edges: Incoming edges with conditions
+
+        Returns:
+            True if at least one conditional edge evaluates to True
+        """
+        for edge in edges:
+            source_id = edge.get("source")
+            condition = edge.get("data", {}).get("condition")
+
+            if not condition:
+                continue
+
+            # Get output from source node
+            source_output = ctx.outputs.get(source_id, {})
+
+            # Evaluate condition
+            if evaluate_condition(condition, source_output):
+                logger.debug("Conditional edge matched",
+                             source=source_id,
+                             target=target_node_id,
+                             condition=condition)
+                return True
+
+        # No conditions matched
+        logger.debug("No conditional edges matched",
+                     target=target_node_id,
+                     edge_count=len(edges))
+        return False
+
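evaluate_condition is imported from elsewhere in the package, so its exact condition grammar is not visible in this file. As a rough sketch of the kind of check it might perform against a source node's output (a field/operator/value comparison; entirely an assumption):

    # Assumed shape only: the real evaluate_condition may use a different grammar.
    from typing import Any, Dict

    def evaluate_condition_sketch(condition: Dict[str, Any], output: Dict[str, Any]) -> bool:
        field = condition.get("field")
        operator = condition.get("operator", "equals")
        expected = condition.get("value")
        actual = output.get(field) if field else None

        if operator == "equals":
            return actual == expected
        if operator == "not_equals":
            return actual != expected
        if operator == "contains":
            return expected in (actual or "")
        return False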
+    def _get_node_data(self, ctx: ExecutionContext, node_id: str) -> Dict[str, Any]:
+        """Get node data from context.
+
+        Args:
+            ctx: ExecutionContext
+            node_id: Node ID
+
+        Returns:
+            Node data dict
+        """
+        for node in ctx.nodes:
+            if node.get("id") == node_id:
+                return node
+        return {}
+
+    def _gather_node_inputs(self, ctx: ExecutionContext, node_id: str) -> Dict[str, Any]:
+        """Gather inputs for a node from upstream outputs.
+
+        Args:
+            ctx: ExecutionContext
+            node_id: Target node ID
+
+        Returns:
+            Dict of upstream outputs keyed by source node type
+        """
+        inputs = {}
+        for edge in ctx.edges:
+            if edge.get("target") == node_id:
+                source_id = edge.get("source")
+                if source_id in ctx.outputs:
+                    # Find source node type
+                    source_node = self._get_node_data(ctx, source_id)
+                    source_type = source_node.get("type", source_id)
+                    inputs[source_type] = ctx.outputs[source_id]
+        return inputs
+
+    # =========================================================================
+    # STATUS NOTIFICATIONS
+    # =========================================================================
+
+    async def _notify_status(self, node_id: str, status: str,
+                             data: Dict[str, Any]) -> None:
+        """Send status notification via callback.
+
+        Args:
+            node_id: Node ID
+            status: Status string
+            data: Additional data
+        """
+        if self.status_callback:
+            try:
+                await self.status_callback(node_id, status, data)
+            except Exception as e:
+                logger.warning("Status callback failed", node_id=node_id, error=str(e))
+
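status_callback is injected by the host application; _notify_status only fixes its async signature (node_id, status, data). A minimal callback satisfying that signature, sketched here as a plain logger (a real host would more likely push the update over WebSocket or SSE):

    # Hypothetical status callback wired into the orchestrator.
    import json
    from typing import Any, Dict

    async def status_callback(node_id: str, status: str, data: Dict[str, Any]) -> None:
        # Emit a structured line per node status change.
        print(json.dumps({"type": "node_status", "node_id": node_id,
                          "status": status, "data": data}, default=str))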
+    # =========================================================================
+    # RECOVERY
+    # =========================================================================
+
+    async def recover_execution(self, execution_id: str,
+                                nodes: List[Dict],
+                                edges: List[Dict]) -> Optional[Dict[str, Any]]:
+        """Recover and resume an interrupted execution.
+
+        Args:
+            execution_id: Execution ID to recover
+            nodes: Workflow nodes
+            edges: Workflow edges
+
+        Returns:
+            Execution result if resumed, None if not found
+        """
+        ctx = await self.cache.load_execution_state(execution_id, nodes, edges)
+        if not ctx:
+            logger.warning("Execution not found for recovery", execution_id=execution_id)
+            return None
+
+        if ctx.status != WorkflowStatus.RUNNING:
+            logger.info("Execution already complete", execution_id=execution_id,
+                        status=ctx.status.value)
+            return {
+                "success": ctx.status == WorkflowStatus.COMPLETED,
+                "execution_id": execution_id,
+                "status": ctx.status.value,
+                "recovered": False,
+            }
+
+        logger.info("Recovering execution",
+                    execution_id=execution_id,
+                    checkpoints=ctx.checkpoints)
+
+        # Reset any RUNNING nodes to PENDING (they were interrupted)
+        for node_exec in ctx.node_executions.values():
+            if node_exec.status == TaskStatus.RUNNING:
+                node_exec.status = TaskStatus.PENDING
+                node_exec.started_at = None
+
+        # Track in memory
+        self._active_contexts[ctx.execution_id] = ctx
+
+        # Resume decide loop
+        try:
+            await self._workflow_decide(ctx, enable_caching=True)
+
+            if ctx.all_nodes_complete():
+                ctx.status = WorkflowStatus.COMPLETED
+            elif ctx.errors:
+                ctx.status = WorkflowStatus.FAILED
+
+            ctx.completed_at = time.time()
+            await self.cache.save_execution_state(ctx)
+
+            return {
+                "success": ctx.status == WorkflowStatus.COMPLETED,
+                "execution_id": ctx.execution_id,
+                "status": ctx.status.value,
+                "recovered": True,
+                "outputs": ctx.outputs,
+            }
+
+        finally:
+            self._active_contexts.pop(ctx.execution_id, None)
+
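recover_execution is meant to be called by the host, typically at startup, for executions that were interrupted mid-run. A hypothetical startup hook is sketched below; how the interrupted execution IDs and their workflow graphs are discovered is not shown in this section, so both are passed in explicitly here.

    # Hypothetical startup hook for resuming interrupted executions.
    from typing import Dict, List

    async def resume_interrupted(orchestrator, workflows: Dict[str, dict],
                                 interrupted_ids: List[str]) -> None:
        for execution_id in interrupted_ids:
            wf = workflows.get(execution_id)
            if not wf:
                continue
            result = await orchestrator.recover_execution(
                execution_id, wf["nodes"], wf["edges"]
            )
            if result:
                print(f"{execution_id}: recovered={result['recovered']} status={result['status']}")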
+    async def get_active_executions(self) -> List[str]:
+        """Get list of active execution IDs.
+
+        Returns:
+            List of execution IDs currently running
+        """
+        return list(self._active_contexts.keys())
+
+    # =========================================================================
+    # DLQ REPLAY
+    # =========================================================================
+
+    async def replay_dlq_entry(self, entry_id: str,
+                               nodes: List[Dict],
+                               edges: List[Dict]) -> Dict[str, Any]:
+        """Replay a failed node from the Dead Letter Queue.
+
+        Creates a new execution context and attempts to re-execute the failed node.
+
+        Args:
+            entry_id: DLQ entry ID to replay
+            nodes: Workflow nodes
+            edges: Workflow edges
+
+        Returns:
+            Execution result dict
+        """
+        # Get DLQ entry
+        entry = await self.cache.get_dlq_entry(entry_id)
+        if not entry:
+            return {
+                "success": False,
+                "error": f"DLQ entry not found: {entry_id}",
+            }
+
+        logger.info("Replaying DLQ entry",
+                    entry_id=entry_id,
+                    node_id=entry.node_id,
+                    node_type=entry.node_type,
+                    original_execution=entry.execution_id)
+
+        # Create new execution context for replay
+        ctx = ExecutionContext.create(
+            workflow_id=entry.workflow_id,
+            session_id="dlq_replay",
+            nodes=nodes,
+            edges=edges,
+        )
+
+        # Get the node execution
+        node_exec = ctx.node_executions.get(entry.node_id)
+        if not node_exec:
+            return {
+                "success": False,
+                "error": f"Node not found in workflow: {entry.node_id}",
+            }
+
+        # Set up context with stored inputs
+        ctx.outputs = entry.inputs  # Restore input state
+
+        ctx.status = WorkflowStatus.RUNNING
+        ctx.started_at = time.time()
+        self._active_contexts[ctx.execution_id] = ctx
+
+        try:
+            # Execute the single node with retry
+            await self._execute_single_node(ctx, node_exec, enable_caching=False)
+
+            if node_exec.status == TaskStatus.COMPLETED:
+                # Success - remove from DLQ
+                await self.cache.remove_from_dlq(entry_id)
+                logger.info("DLQ replay succeeded",
+                            entry_id=entry_id,
+                            node_id=entry.node_id)
+
+                return {
+                    "success": True,
+                    "execution_id": ctx.execution_id,
+                    "node_id": entry.node_id,
+                    "result": node_exec.output,
+                    "removed_from_dlq": True,
+                }
+            else:
+                # Still failing - update DLQ entry
+                await self.cache.update_dlq_entry(
+                    entry_id,
+                    entry.retry_count + 1,
+                    node_exec.error or "Unknown error"
+                )
+
+                return {
+                    "success": False,
+                    "execution_id": ctx.execution_id,
+                    "node_id": entry.node_id,
+                    "error": node_exec.error,
+                    "retry_count": entry.retry_count + 1,
+                }
+
+        finally:
+            self._active_contexts.pop(ctx.execution_id, None)
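A usage sketch for the DLQ replay path, as it might be invoked from an admin endpoint or maintenance script. The orchestrator instance, entry ID, and workflow payload are placeholders; only the replay_dlq_entry call and the "success"/"error" keys of its return value come from the code above.

    # Usage sketch: replaying a dead-lettered node (IDs and payloads are placeholders).
    async def replay_failed_node(orchestrator, entry_id: str, workflow: dict) -> bool:
        result = await orchestrator.replay_dlq_entry(
            entry_id, workflow["nodes"], workflow["edges"]
        )
        if result["success"]:
            print(f"DLQ entry {entry_id} replayed and removed from the queue")
        else:
            print(f"DLQ entry {entry_id} still failing: {result.get('error')}")
        return result["success"]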